"Note: this varlen kernel performance is as good as the non-varlen kernel shown in Nsight-Compute. As you may observe that the TFLOPS is a bit lower, that's because the unpad operation is included in the above benchmark."
)
if__name__=="__main__":
if__name__=="__main__":
arch=nvcc.get_target_compute_version()
arch=nvcc.get_target_compute_version()
...
@@ -778,6 +782,8 @@ if __name__ == "__main__":
...
@@ -778,6 +782,8 @@ if __name__ == "__main__":
parser.add_argument(
parser.add_argument(
'--use_split',action='store_true',default=False,help='Use split for dK/dV')
'--use_split',action='store_true',default=False,help='Use split for dK/dV')