Add non-persistent fp8 triton_rowwise kernel (#2484)
Summary:
Pull Request resolved: #2484

X-link: pytorch/FBGEMM#3212

X-link: facebookresearch/FBGEMM#308

The triton_rowwise persistent kernel performs poorly on MI300 compared to the non-persistent kernel when both are run with exhaustive AMD-specific tuning. This diff therefore adds a --no_use_persistent flag to the fp8_gemm_rowwise benchmark operator so the non-persistent kernel can be selected.

Reviewed By: htyu

Differential Revision: D63741099

fbshipit-source-id: c276415ddf8f5d24ffeba70b8ee6493011b393e1
karthik-man authored and facebook-github-bot committed Oct 3, 2024
1 parent 737084e commit 6b4f339
Showing 1 changed file with 9 additions and 2 deletions.
torchbenchmark/operators/fp8_gemm_rowwise/operator.py

@@ -27,8 +27,13 @@ def parse_args(args: List[str]) -> argparse.Namespace:
         "--no_fp8_fast_accum", dest="fp8_fast_accum", action="store_false"
     )
     parser.add_argument("--no_use_tma", dest="use_tma", action="store_false")
-    args = parser.parse_args(args)
-    return args
+    parser.add_argument(
+        "--no_use_persistent",
+        dest="no_use_persistent",
+        action="store_true",
+    )
+    parsed_args = parser.parse_args(args)
+    return parsed_args
 
 
 try:
@@ -115,6 +120,7 @@ def __init__(
         self.shapes = BUILDIN_SHAPES
         self.fp8_fast_accum = addmm_args.fp8_fast_accum
         self.use_tma = addmm_args.use_tma
+        self.no_use_persistent = addmm_args.no_use_persistent
 
     @register_benchmark(enabled=HAS_TRITON, baseline=True)
     def _triton(self, xq, wq, x_scale, w_scale) -> Callable:
@@ -125,6 +131,7 @@ def _triton(self, xq, wq, x_scale, w_scale) -> Callable:
             w_scale,
             fp8_fast_accum=self.fp8_fast_accum,
             tma_persistent=self.use_tma,
+            no_use_persistent=self.no_use_persistent,
         )
 
     @register_benchmark(enabled=HAS_CUTLASS)
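
For reference, a minimal, self-contained sketch of the new flag's parsing behavior, reusing only the argparse wiring visible in this diff (the benchmark harness and the matmul call are not reproduced). Note the asymmetry with the existing flags: --no_fp8_fast_accum and --no_use_tma use store_false with positively named dests, while --no_use_persistent uses store_true with a negated dest, so the default (False) keeps the persistent kernel and passing the flag opts into the non-persistent one.

import argparse
from typing import List


def parse_args(args: List[str]) -> argparse.Namespace:
    # Parser wiring copied from the diff above; nothing else is reproduced.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--no_fp8_fast_accum", dest="fp8_fast_accum", action="store_false"
    )
    parser.add_argument("--no_use_tma", dest="use_tma", action="store_false")
    parser.add_argument(
        "--no_use_persistent",
        dest="no_use_persistent",
        action="store_true",
    )
    return parser.parse_args(args)


# Default: no_use_persistent is False, so the persistent kernel is used.
assert parse_args([]).no_use_persistent is False

# With the flag: opt into the non-persistent kernel, e.g. on MI300.
assert parse_args(["--no_use_persistent"]).no_use_persistent is True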
