logger.info("Using decode attention configuration from %s for attention layer.",config_file_path)
# If a configuration has been found, return it
returnjson.load(f)
else:
logger.warning("Can not find best decode attention configuration %s for attention layer, it may not have the best performance to use default json. Please tune one. ",config_file_path)
logger.warning("Using default decode attention configuration from %s for attention layer. It may not have the best performance to use default json. ",config_file_path)
# If a configuration has been found, return it
returnjson.load(f)
else:
raiseValueError("Please surpport default config can match 16 1 576 512")
# If no optimized configuration is available, we will use the default
# configuration
returnNone
classTritonMLABackend(AttentionBackend):
classTritonMLABackend(AttentionBackend):
...
@@ -736,11 +790,14 @@ class TritonMLAImpl(MLACommonImpl[TritonMLAMetadata]):
...
@@ -736,11 +790,14 @@ class TritonMLAImpl(MLACommonImpl[TritonMLAMetadata]):