diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 06784b4e0515d9cdcbaeae31660a0b1faf682703..0000000000000000000000000000000000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,325 +0,0 @@
-cmake_minimum_required(VERSION 3.26)
-project(sage_attention LANGUAGES CXX)
-
-set(TARGET_DEVICE "cuda" CACHE STRING "Target device backend for kernel")
-
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
-include(FetchContent)
-file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
-message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
-
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
-
-include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
-
-if(DEFINED Python_EXECUTABLE)
-  # Allow passing through the interpreter (e.g. from setup.py).
-  find_package(Python COMPONENTS Development Development.SABIModule Interpreter)
-  if (NOT Python_FOUND)
-    message(FATAL_ERROR "Unable to find python matching: ${Python_EXECUTABLE}.")
-  endif()
-else()
-  find_package(Python REQUIRED COMPONENTS Development Development.SABIModule Interpreter)
-endif()
-
-append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
-
-find_package(Torch REQUIRED)
-
-if (NOT TARGET_DEVICE STREQUAL "cuda" AND
-    NOT TARGET_DEVICE STREQUAL "rocm")
-  return()
-endif()
-
-if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
-  set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX")
-else()
-  set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX")
-endif()
-
-if (NOT HIP_FOUND AND CUDA_FOUND)
-  set(GPU_LANG "CUDA")
-
-
-elseif(HIP_FOUND)
-  set(GPU_LANG "HIP")
-
-  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
-  # not let cmake recognize .hip files. In order to get cmake to understand the
-  # .hip extension automatically, HIP must be enabled explicitly.
-  enable_language(HIP)
-else()
-  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
-endif()
-
-
-if(GPU_LANG STREQUAL "CUDA")
-  clear_cuda_arches(CUDA_ARCH_FLAGS)
-  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
-  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
-  # Filter the target architectures by the supported archs
-  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
-  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
-
-  if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA")
-    list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
-  endif()
-
-  add_compile_definitions(CUDA_KERNEL)
-elseif(GPU_LANG STREQUAL "HIP")
-  set(ROCM_ARCHS "${HIP_SUPPORTED_ARCHS}")
-  # TODO: remove this once we can set specific archs per source file set.
-  override_gpu_arches(GPU_ARCHES
-    ${GPU_LANG}
-    "${${GPU_LANG}_SUPPORTED_ARCHS}")
-
-  add_compile_definitions(ROCM_KERNEL)
-else()
-  override_gpu_arches(GPU_ARCHES
-    ${GPU_LANG}
-    "${${GPU_LANG}_SUPPORTED_ARCHS}")
-endif()
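The arch selection above works in two passes: torch's `-gencode` flags are lifted out of `CMAKE_CUDA_FLAGS`, normalized to `<major>.<minor>` form, and then filtered against `CUDA_SUPPORTED_ARCHS`. A sketch of the expected intermediate values, assuming torch had injected flags for 8.0 and 9.0 (assumed inputs, not taken from a real configure run):

```cmake
# Illustrative walk-through of the helpers used above (see cmake/utils.cmake).
# Assume: CMAKE_CUDA_FLAGS = "-Wall -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90"
clear_cuda_arches(CUDA_ARCH_FLAGS)
# CUDA_ARCH_FLAGS = "-gencode arch=compute_80,code=sm_80;-gencode arch=compute_90,code=sm_90"
# CMAKE_CUDA_FLAGS = "-Wall"
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
# CUDA_ARCHS = "8.0;9.0"
cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
# CUDA_ARCHS = "8.0;9.0"  (both capabilities are in CUDA_SUPPORTED_ARCHS)
```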
-
-get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG})
-list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS})
-
-set(TORCH_sage_attention_SRC
-  torch-ext/torch_binding.cpp torch-ext/torch_binding.h
-)
-
-list(APPEND SRC "${TORCH_sage_attention_SRC}")
-
-set(_qattn_sm80_SRC
-  "sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu"
-  "sage_attention/qattn/attn_cuda_sm80.h"
-  "sage_attention/qattn/attn_utils.cuh"
-)
-
-# TODO: check if CLion supports this:
-# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
-set_source_files_properties(
-  ${_qattn_sm80_SRC}
-  PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")
-
-if(GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(_qattn_sm80_ARCHS "8.0" "${CUDA_ARCHS}")
-  message(STATUS "Capabilities for kernel _qattn_sm80: ${_qattn_sm80_ARCHS}")
-  set_gencode_flags_for_srcs(SRCS "${_qattn_sm80_SRC}" CUDA_ARCHS "${_qattn_sm80_ARCHS}")
-
-  foreach(_KERNEL_SRC ${_qattn_sm80_SRC})
-    if(_KERNEL_SRC MATCHES ".*\\.cu$")
-      set_property(
-        SOURCE ${_KERNEL_SRC}
-        APPEND PROPERTY
-        COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-std=c++17;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;--use_fast_math;--threads=1;-Xptxas=-v;-diag-suppress=174>"
-      )
-    endif()
-  endforeach()
-
-  foreach(_KERNEL_SRC ${_qattn_sm80_SRC})
-    set_property(
-      SOURCE ${_KERNEL_SRC}
-      APPEND PROPERTY
-      COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-g;-O3;-fopenmp;-lgomp;-std=c++17;-DENABLE_BF16>"
-    )
-  endforeach()
-
-  list(APPEND SRC "${_qattn_sm80_SRC}")
-endif()
-
-
-set(_qattn_sm90_SRC
-  "sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu"
-  "sage_attention/qattn/attn_cuda_sm90.h"
-  "sage_attention/qattn/attn_utils.cuh"
-  "sage_attention/cuda_tensormap_shim.cuh"
-)
-
-# TODO: check if CLion supports this:
-# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
-set_source_files_properties(
-  ${_qattn_sm90_SRC}
-  PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")
-
-if(GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(_qattn_sm90_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
-  message(STATUS "Capabilities for kernel _qattn_sm90: ${_qattn_sm90_ARCHS}")
-  set_gencode_flags_for_srcs(SRCS "${_qattn_sm90_SRC}" CUDA_ARCHS "${_qattn_sm90_ARCHS}")
-
-  foreach(_KERNEL_SRC ${_qattn_sm90_SRC})
-    if(_KERNEL_SRC MATCHES ".*\\.cu$")
-      set_property(
-        SOURCE ${_KERNEL_SRC}
-        APPEND PROPERTY
-        COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-std=c++17;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;--use_fast_math;--threads=1;-Xptxas=-v;-diag-suppress=174>"
-      )
-    endif()
-  endforeach()
-
-  foreach(_KERNEL_SRC ${_qattn_sm90_SRC})
-    set_property(
-      SOURCE ${_KERNEL_SRC}
-      APPEND PROPERTY
-      COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-g;-O3;-fopenmp;-lgomp;-std=c++17;-DENABLE_BF16>"
-    )
-  endforeach()
-
-  list(APPEND SRC "${_qattn_sm90_SRC}")
-endif()
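Each kernel set in this file repeats the same pattern: intersect the requested capabilities with `CUDA_ARCHS`, attach per-source `-gencode` flags, then scope NVCC-only and host-only options behind `COMPILE_LANGUAGE` generator expressions so NVCC flags never reach the host C++ compiler. A minimal, self-contained sketch of the idiom (the file name is illustrative, not part of the tree):

```cmake
# NVCC-only flags apply when the CUDA compiler handles the file; host-only
# flags (OpenMP, -DENABLE_BF16) apply only to C++ compilation of the same file.
set_property(SOURCE "my_kernel.cu" APPEND PROPERTY
  COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:--use_fast_math;-Xptxas=-v>")
set_property(SOURCE "my_kernel.cu" APPEND PROPERTY
  COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-fopenmp;-DENABLE_BF16>")
```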
-"sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu" -"sage_attention/qattn/attn_cuda_sm89.h" -"sage_attention/qattn/qk_int_sv_f8_cuda_sm89.cuh" -"sage_attention/qattn/attn_utils.cuh" -) - -# TODO: check if CLion support this: -# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories -set_source_files_properties( - ${_qattn_sm89_SRC} - PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.") - -if(GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(_qattn_sm89_ARCHS "8.9" "${CUDA_ARCHS}") - message(STATUS "Capabilities for kernel _qattn_sm89: ${_qattn_sm89_ARCHS}") - set_gencode_flags_for_srcs(SRCS "${_qattn_sm89_SRC}" CUDA_ARCHS "${_qattn_sm89_ARCHS}") - - - foreach(_KERNEL_SRC ${_qattn_sm89_SRC}) - if(_KERNEL_SRC MATCHES ".*\\.cu$") - set_property( - SOURCE ${_KERNEL_SRC} - APPEND PROPERTY - COMPILE_OPTIONS "$<$:-O3;-std=c++17;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;--use_fast_math;--threads=1;-Xptxas=-v;-diag-suppress=174>" - ) - endif() - endforeach() - - foreach(_KERNEL_SRC ${_qattn_sm89_SRC}) - set_property( - SOURCE ${_KERNEL_SRC} - APPEND PROPERTY - COMPILE_OPTIONS "$<$:-g;-O3;-fopenmp;-lgomp;-std=c++17;-DENABLE_BF16>" - ) - endforeach() - - list(APPEND SRC "${_qattn_sm89_SRC}") -endif() - - - -set(_fused_SRC - "sage_attention/fused/fused.cu" -"sage_attention/fused/fused.h" -) - -# TODO: check if CLion support this: -# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories -set_source_files_properties( - ${_fused_SRC} - PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.") - -if(GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(_fused_ARCHS "8.0;8.9;9.0;9.0a" "${CUDA_ARCHS}") - message(STATUS "Capabilities for kernel _fused: ${_fused_ARCHS}") - set_gencode_flags_for_srcs(SRCS "${_fused_SRC}" CUDA_ARCHS "${_fused_ARCHS}") - - - foreach(_KERNEL_SRC ${_fused_SRC}) - if(_KERNEL_SRC MATCHES ".*\\.cu$") - set_property( - SOURCE ${_KERNEL_SRC} - APPEND PROPERTY - COMPILE_OPTIONS "$<$:-O3;-std=c++17;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;--use_fast_math;--threads=1;-Xptxas=-v;-diag-suppress=174>" - ) - endif() - endforeach() - - foreach(_KERNEL_SRC ${_fused_SRC}) - set_property( - SOURCE ${_KERNEL_SRC} - APPEND PROPERTY - COMPILE_OPTIONS "$<$:-g;-O3;-fopenmp;-lgomp;-std=c++17;-DENABLE_BF16>" - ) - endforeach() - - list(APPEND SRC "${_fused_SRC}") -endif() - - - -set(_qattn_SRC - "sage_attention/cp_async.cuh" -"sage_attention/dispatch_utils.h" -"sage_attention/math.cuh" -"sage_attention/mma.cuh" -"sage_attention/numeric_conversion.cuh" -"sage_attention/permuted_smem.cuh" -"sage_attention/reduction_utils.cuh" -"sage_attention/wgmma.cuh" -"sage_attention/utils.cuh" -"sage_attention/cuda_tensormap_shim.cuh" -) - - -if(GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(_qattn_ARCHS "8.0;8.9;9.0;9.0a" "${CUDA_ARCHS}") - message(STATUS "Capabilities for kernel _qattn: ${_qattn_ARCHS}") - set_gencode_flags_for_srcs(SRCS "${_qattn_SRC}" CUDA_ARCHS "${_qattn_ARCHS}") - - - foreach(_KERNEL_SRC ${_qattn_SRC}) - if(_KERNEL_SRC MATCHES ".*\\.cu$") - set_property( - SOURCE ${_KERNEL_SRC} - APPEND PROPERTY - COMPILE_OPTIONS "$<$:-O3;-std=c++17;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;--use_fast_math;--threads=1;-Xptxas=-v;-diag-suppress=174>" - ) - endif() - endforeach() - - foreach(_KERNEL_SRC ${_qattn_SRC}) - set_property( - SOURCE ${_KERNEL_SRC} - APPEND PROPERTY - COMPILE_OPTIONS 
"$<$:-g;-O3;-fopenmp;-lgomp;-std=c++17;-DENABLE_BF16>" - ) - endforeach() - - list(APPEND SRC "${_qattn_SRC}") -endif() - - -define_gpu_extension_target( - _sage_attention_57cb7ec_dirty - DESTINATION _sage_attention_57cb7ec_dirty - LANGUAGE ${GPU_LANG} - SOURCES ${SRC} - COMPILE_FLAGS ${GPU_FLAGS} - ARCHITECTURES ${GPU_ARCHES} - #INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} - USE_SABI 3 - WITH_SOABI) - -target_link_options(_sage_attention_57cb7ec_dirty PRIVATE -static-libstdc++) - diff --git a/build.toml b/build.toml index 5859c1b4ec49cb40088f939bb0d26b1b77f462de..039d94ea52a91291d7a594832225480183d4c734 100644 --- a/build.toml +++ b/build.toml @@ -1,7 +1,7 @@ [general] name = "sage_attention" universal = false -cuda-minver = "12.4" +cuda-minver = "12.0" [torch] src = [ @@ -12,7 +12,7 @@ src = [ [kernel._qattn] depends = ["torch"] backend = "cuda" -cuda-minver = "12.4" +cuda-minver = "12.0" cuda-capabilities = [ "8.0", "8.9", "9.0a" ] @@ -43,7 +43,7 @@ cuda-flags = [ [kernel._qattn_sm80] depends = ["torch"] backend = "cuda" -cuda-minver = "12.4" +cuda-minver = "12.0" cuda-capabilities = [ "8.0" ] @@ -69,7 +69,7 @@ cuda-flags = [ [kernel._qattn_sm89] depends = ["torch"] backend = "cuda" -cuda-minver = "12.4" +cuda-minver = "12.0" cuda-capabilities = [ "8.9", ] @@ -102,7 +102,7 @@ cuda-flags = [ [kernel._qattn_sm90] depends = ["torch"] backend = "cuda" -cuda-minver = "12.4" +cuda-minver = "12.0" cuda-capabilities = [ "9.0a", ] @@ -127,7 +127,7 @@ cuda-flags = [ [kernel._fused] depends = ["torch"] backend = "cuda" -cuda-minver = "12.4" +cuda-minver = "12.0" cuda-capabilities = [ "8.0", "8.9", "9.0a", ] diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py index 5df69e92664edfbd0820d5126792e41e23a72762..43b5e35a4154627e6457993372bd46b2cf78b89a 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__init__.py @@ -1,5 +1,5 @@ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 -from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda +from .core import sageattn __all__ = [ @@ -8,5 +8,4 @@ __all__ = [ "sub_mean", "per_channel_fp8", "sageattn", - "sageattn_qk_int8_pv_fp8_cuda", ] \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc index df30b66c76aa862ada707a77f7670df120c1aa75..79fa384f24c4465985e069da38fb56e5319e1900 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc index 9fa0ddc67b1d0dcebef0b18f01b36cb31a6e6123..11b434abdd87fbb199cefa7547a4438820e05a34 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc index 
e4243ea712c6f1fc4fed24dd2be9dabf8d3634f5..1f6905f66088acd3598f890b902fbeed7cb4b2cd 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc index 5cc45e5dd169390a72ca5c0bec4b852c3d8e57be..5480c21dc23ec14d67d3fe52dadd95f24831613f 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc index abcab9981ad233aa06596e67bdc6b084e787c9f4..d15ec3820d8f1084b413179c689bc9ceba4ad6a0 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py index a47d37d024b34df8caa6f87a163063f853d7f7c4..3f5bf8b095244a566572d3b7a9045ec0408db4e5 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _sage_attention_af2d0c0_dirty -ops = torch.ops._sage_attention_af2d0c0_dirty +from . import _sage_attention_1369690_dirty +ops = torch.ops._sage_attention_1369690_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_sage_attention_af2d0c0_dirty::{op_name}" \ No newline at end of file + return f"_sage_attention_1369690_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..056100ced5f84e1b465e5767370ce625e33e2ff5 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c7a3fa0f2b5db528e3854fcb72e3bc5936ed760336b96bf0e183d19fada3767 +size 26037568 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so deleted file mode 100755 index e67bf285c3343626e9b2c73b754114d033b85606..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83f3b3d1c1371cf577a4e2c2fa3bbeef137aa93a89cf380816c14e650b1449f6 -size 26037568 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py index dc44a8e1ee17a5c5c65da5adda6faf9228cca55e..590e664e325d79068a56ee796a548f92ead07442 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/core.py @@ -363,7 +363,7 @@ def sageattn_qk_int8_pv_fp16_cuda( if pv_accum_dtype == "fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + lse = ops.qk_int8_sv_f16_accum_f32_attn( q_int8, k_int8, v, @@ -379,7 +379,7 @@ def sageattn_qk_int8_pv_fp16_cuda( elif pv_accum_dtype == "fp16": if smooth_v: smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + lse = ops.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( q_int8, k_int8, smoothed_v, @@ -395,7 +395,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) else: v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + lse = ops.qk_int8_sv_f16_accum_f16_attn( q_int8, k_int8, v, @@ -410,7 +410,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) elif pv_accum_dtype == "fp16+fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + lse = ops.qk_int8_sv_f16_accum_f16_attn_inst_buf( q_int8, k_int8, v, @@ -941,20 +941,6 @@ def sageattn_qk_int8_pv_fp8_cuda_sm90( _return_lse, ) elif pv_accum_dtype == "fp32+fp32": - print( - "qint8", - q_int8.shape, - "qscale", - q_scale.shape, - "kint8", - k_int8.shape, - "kscale", - k_scale.shape, - "vfp8", - v_fp8.shape, - "vscale", - v_scale.shape, - ) lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( q_int8, k_int8, diff --git a/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/sage_attention/layers.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py index 5df69e92664edfbd0820d5126792e41e23a72762..43b5e35a4154627e6457993372bd46b2cf78b89a 100644 --- 
a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__init__.py @@ -1,5 +1,5 @@ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 -from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda +from .core import sageattn __all__ = [ @@ -8,5 +8,4 @@ __all__ = [ "sub_mean", "per_channel_fp8", "sageattn", - "sageattn_qk_int8_pv_fp8_cuda", ] \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc index b716f15f1d75301b2ec12fcecf798765c84696bb..89e250046996d5218210dd1e5c1a0e91e013c725 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc index 13e20a4927ab8ba1eda55a957376f484b2cb5aa4..ca7b3ab23e863062a7d1d02cc6f10a6031b8b785 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc index 74f9952b6c0d3eca1ca8540cc5b00ebf5ae6911d..ed557c37f7550313550120d064b85ac3791e6213 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc index 9c64a1f4a9e6caf2ab970155e9720ac6839a7f35..b4df23f5e0576f0653e73bde218b1500ebaba668 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc index 8576961982cd5a9a1e9d8746c08859bbaa4508aa..456d8f81775a4aca8491754115e6369f4ced980d 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py index a47d37d024b34df8caa6f87a163063f853d7f7c4..3f5bf8b095244a566572d3b7a9045ec0408db4e5 100644 --- a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _sage_attention_af2d0c0_dirty -ops = torch.ops._sage_attention_af2d0c0_dirty +from . 
import _sage_attention_1369690_dirty +ops = torch.ops._sage_attention_1369690_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_sage_attention_af2d0c0_dirty::{op_name}" \ No newline at end of file + return f"_sage_attention_1369690_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..4f74273aea50cc1c95d9c033d1b4393a2b628a8e --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1728ddb8a13631676b67cb867e7af21388f2f4d23279805bb3b5fa11bc6119c1 +size 26553840 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so deleted file mode 100755 index ba4ec6e3db098e29d76438fce1d908f2e088544c..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:871d2abf021f7175f2a66cd9f3599fdd88c9be0c98df1bb4d09f9905d955405f -size 26553840 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py index dc44a8e1ee17a5c5c65da5adda6faf9228cca55e..590e664e325d79068a56ee796a548f92ead07442 100644 --- a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/core.py @@ -363,7 +363,7 @@ def sageattn_qk_int8_pv_fp16_cuda( if pv_accum_dtype == "fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + lse = ops.qk_int8_sv_f16_accum_f32_attn( q_int8, k_int8, v, @@ -379,7 +379,7 @@ def sageattn_qk_int8_pv_fp16_cuda( elif pv_accum_dtype == "fp16": if smooth_v: smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + lse = ops.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( q_int8, k_int8, smoothed_v, @@ -395,7 +395,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) else: v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + lse = ops.qk_int8_sv_f16_accum_f16_attn( q_int8, k_int8, v, @@ -410,7 +410,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) elif pv_accum_dtype == "fp16+fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + lse = ops.qk_int8_sv_f16_accum_f16_attn_inst_buf( q_int8, k_int8, v, @@ -941,20 +941,6 @@ def sageattn_qk_int8_pv_fp8_cuda_sm90( _return_lse, ) elif pv_accum_dtype == "fp32+fp32": - print( - "qint8", - q_int8.shape, - "qscale", - q_scale.shape, - "kint8", - k_int8.shape, - "kscale", - k_scale.shape, - "vfp8", - v_fp8.shape, - "vscale", - v_scale.shape, - ) lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( q_int8, k_int8, diff --git a/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/sage_attention/layers.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py 
b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py index 5df69e92664edfbd0820d5126792e41e23a72762..43b5e35a4154627e6457993372bd46b2cf78b89a 100644 --- a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__init__.py @@ -1,5 +1,5 @@ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 -from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda +from .core import sageattn __all__ = [ @@ -8,5 +8,4 @@ __all__ = [ "sub_mean", "per_channel_fp8", "sageattn", - "sageattn_qk_int8_pv_fp8_cuda", ] \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc index 7f605fcf57bbe7727db8e69577e5a253d1b859ab..1265f608f89467b973713f14bc9ebc137ba1fe95 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc index 4d540195551fdddd65a4466136ca5ff7bc1d1074..c72bbe15717422f146f59a8c542d1fd40d477af2 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc index 5c94baf891a0c7057f4aeaa0db06a729a20e13bd..2ec52ba13bf849b28c75a35aad0247fb80641c63 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc index c8cdf959f7886f07caa5dd73f83cdaa5082195b5..d52ec4ede3e9be38fa11d10f593d65b9bbd9327e 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc index 10dedba33e4fc653daa7344de40e672dd0b466a8..6f68872ec0cedce41a8abf3c3cfbd2d4e087e488 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py index a47d37d024b34df8caa6f87a163063f853d7f7c4..3f5bf8b095244a566572d3b7a9045ec0408db4e5 100644 --- a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_ops.py @@ -1,9 +1,9 @@ import torch -from . 
import _sage_attention_af2d0c0_dirty -ops = torch.ops._sage_attention_af2d0c0_dirty +from . import _sage_attention_1369690_dirty +ops = torch.ops._sage_attention_1369690_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_sage_attention_af2d0c0_dirty::{op_name}" \ No newline at end of file + return f"_sage_attention_1369690_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..faac415c4ddf55aaa66929bbb468fe9edba9f1e1 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636461b53d3b3c4c1cd1940bc6ecb32728cb0f80bb347cf52591afd0ea121c8c +size 26037392 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so deleted file mode 100755 index 6a3243e183fa266d04de9f91ae9db754dd22f2ef..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26b18ae63bccd4c5926533ffa1d0995e7bf3faf7919c0c55e1b829267ac73afd -size 26037392 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py index dc44a8e1ee17a5c5c65da5adda6faf9228cca55e..590e664e325d79068a56ee796a548f92ead07442 100644 --- a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py +++ b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/core.py @@ -363,7 +363,7 @@ def sageattn_qk_int8_pv_fp16_cuda( if pv_accum_dtype == "fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + lse = ops.qk_int8_sv_f16_accum_f32_attn( q_int8, k_int8, v, @@ -379,7 +379,7 @@ def sageattn_qk_int8_pv_fp16_cuda( elif pv_accum_dtype == "fp16": if smooth_v: smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + lse = ops.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( q_int8, k_int8, smoothed_v, @@ -395,7 +395,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) else: v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + lse = ops.qk_int8_sv_f16_accum_f16_attn( q_int8, k_int8, v, @@ -410,7 +410,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) elif pv_accum_dtype == "fp16+fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + lse = ops.qk_int8_sv_f16_accum_f16_attn_inst_buf( q_int8, k_int8, v, @@ -941,20 +941,6 @@ def sageattn_qk_int8_pv_fp8_cuda_sm90( _return_lse, ) elif pv_accum_dtype == "fp32+fp32": - print( - "qint8", - q_int8.shape, - "qscale", - q_scale.shape, - "kint8", - k_int8.shape, - "kscale", - k_scale.shape, - "vfp8", - v_fp8.shape, - "vscale", - v_scale.shape, - ) lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( q_int8, k_int8, diff --git a/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/sage_attention/layers.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py index 5df69e92664edfbd0820d5126792e41e23a72762..43b5e35a4154627e6457993372bd46b2cf78b89a 100644 --- a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__init__.py @@ -1,5 +1,5 @@ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 -from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda +from .core import sageattn __all__ = [ @@ -8,5 +8,4 @@ __all__ = [ "sub_mean", "per_channel_fp8", "sageattn", - "sageattn_qk_int8_pv_fp8_cuda", ] \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc index cbaf8e8a207ca0203bf23e3c294e687f0a58f280..456b0fbf7a0ba3ec6a2dc36e26287b1a6944c130 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc index ba940db87cdb7343333520574fdf5715a9d29745..163ddb9c51de1f8e9df42d7a05dbb63fbe5313b0 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc index 6ced4344b438ed70d903827c63856e4d41da7eac..a3945ef7e2336f01d729258df5003ef84fa7449a 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc index ae3a110d1a30de4353d0e261dbbc18ebec86847f..a6b1e337187a955c72c026fa0c5e56ed15b65b99 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc index d151e75d0f59cea853edd09166e8160300620a86..e15aa4d4625174af8fc475ae5b5db3b6ed04f1be 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py index a47d37d024b34df8caa6f87a163063f853d7f7c4..3f5bf8b095244a566572d3b7a9045ec0408db4e5 100644 --- a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py +++ 
b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _sage_attention_af2d0c0_dirty -ops = torch.ops._sage_attention_af2d0c0_dirty +from . import _sage_attention_1369690_dirty +ops = torch.ops._sage_attention_1369690_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_sage_attention_af2d0c0_dirty::{op_name}" \ No newline at end of file + return f"_sage_attention_1369690_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..70be665802f3e6d80bab87150ebf7cbfc7141622 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bfc9cab8f63665571d07c111a94edb3bec9a17aba7721a4c67be5392db0841d +size 26553920 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so deleted file mode 100755 index 8356782957eb9dc5a4f301d18eb77aa0d79106d6..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2681241cb3fee535e10ba52179293982bca60a5fed972404fdec8ae5fa848175 -size 26549824 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py index dc44a8e1ee17a5c5c65da5adda6faf9228cca55e..590e664e325d79068a56ee796a548f92ead07442 100644 --- a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py +++ b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/core.py @@ -363,7 +363,7 @@ def sageattn_qk_int8_pv_fp16_cuda( if pv_accum_dtype == "fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + lse = ops.qk_int8_sv_f16_accum_f32_attn( q_int8, k_int8, v, @@ -379,7 +379,7 @@ def sageattn_qk_int8_pv_fp16_cuda( elif pv_accum_dtype == "fp16": if smooth_v: smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + lse = ops.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( q_int8, k_int8, smoothed_v, @@ -395,7 +395,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) else: v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + lse = ops.qk_int8_sv_f16_accum_f16_attn( q_int8, k_int8, v, @@ -410,7 +410,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) elif pv_accum_dtype == "fp16+fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + lse = ops.qk_int8_sv_f16_accum_f16_attn_inst_buf( q_int8, k_int8, v, @@ -941,20 +941,6 @@ def sageattn_qk_int8_pv_fp8_cuda_sm90( _return_lse, ) elif pv_accum_dtype == "fp32+fp32": - print( - "qint8", - q_int8.shape, - "qscale", - q_scale.shape, - "kint8", - k_int8.shape, - "kscale", - k_scale.shape, - "vfp8", - v_fp8.shape, - "vscale", - v_scale.shape, - ) lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( q_int8, k_int8, diff --git a/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/sage_attention/layers.py deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py index 5df69e92664edfbd0820d5126792e41e23a72762..43b5e35a4154627e6457993372bd46b2cf78b89a 100644 --- a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__init__.py @@ -1,5 +1,5 @@ from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8 -from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda +from .core import sageattn __all__ = [ @@ -8,5 +8,4 @@ __all__ = [ "sub_mean", "per_channel_fp8", "sageattn", - "sageattn_qk_int8_pv_fp8_cuda", ] \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc index ee05f0227a0bb3c4cc41aa2fdfd1be61751b16b0..77ee0c5cb4d71e324a08437085329d613bddb942 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc index dc01cff523b27fa1a7f05a443e1cec8d6edbe91f..3304fc1253a1ff2b04ec288f556cfff1ecc2ca92 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc index b47fe92066cbb0c8eff70a0c218eca491fbc882c..1feb3f559f17390d27dc3266976cfbefc3c83c1e 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/core.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc index a1794e2144e0db4eac5b4fa09b9249eda17a07ed..e7e00ede7b0bf73713e2545e8c3b136e3b521290 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc index 2f2a696a2a894a3039f4852d97807db73b4ff334..ffaef41899636fbeae8a2725f47d9b70074f7ed3 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/__pycache__/quant_per_thread.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py index a47d37d024b34df8caa6f87a163063f853d7f7c4..3f5bf8b095244a566572d3b7a9045ec0408db4e5 100644 --- 
a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _sage_attention_af2d0c0_dirty -ops = torch.ops._sage_attention_af2d0c0_dirty +from . import _sage_attention_1369690_dirty +ops = torch.ops._sage_attention_1369690_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_sage_attention_af2d0c0_dirty::{op_name}" \ No newline at end of file + return f"_sage_attention_1369690_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..3a4768d30801b4411c86872d8fe296d0a27161ff --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_1369690_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fef61629b7537ad41d31cf5715a11a38ce3f7cc97b0d5bf26356492b36ad5c29 +size 26608048 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so deleted file mode 100755 index e67830941fda31e45c6e2558b355a1cce6ebc2f7..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff47cafcc3abed4dc02589ee11c315f3b88f65a0510caa89a07825ccd8ea1a48 -size 26608048 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py index dc44a8e1ee17a5c5c65da5adda6faf9228cca55e..590e664e325d79068a56ee796a548f92ead07442 100644 --- a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py +++ b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/core.py @@ -363,7 +363,7 @@ def sageattn_qk_int8_pv_fp16_cuda( if pv_accum_dtype == "fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn( + lse = ops.qk_int8_sv_f16_accum_f32_attn( q_int8, k_int8, v, @@ -379,7 +379,7 @@ def sageattn_qk_int8_pv_fp16_cuda( elif pv_accum_dtype == "fp16": if smooth_v: smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( + lse = ops.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn( q_int8, k_int8, smoothed_v, @@ -395,7 +395,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) else: v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn( + lse = ops.qk_int8_sv_f16_accum_f16_attn( q_int8, k_int8, v, @@ -410,7 +410,7 @@ def sageattn_qk_int8_pv_fp16_cuda( ) elif pv_accum_dtype == "fp16+fp32": v = v.to(torch.float16) - lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf( + lse = ops.qk_int8_sv_f16_accum_f16_attn_inst_buf( q_int8, k_int8, v, @@ -941,20 +941,6 @@ def sageattn_qk_int8_pv_fp8_cuda_sm90( _return_lse, ) elif pv_accum_dtype == "fp32+fp32": - print( - "qint8", - q_int8.shape, - "qscale", - q_scale.shape, - "kint8", - k_int8.shape, - "kscale", - k_scale.shape, - "vfp8", - v_fp8.shape, - "vscale", - v_scale.shape, - ) lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90( q_int8, k_int8, diff --git a/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/layers.py 
b/build/torch28-cxx11-cu129-x86_64-linux/sage_attention/layers.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/cmake/hipify.py b/cmake/hipify.py
deleted file mode 100644
index a1539c02a297d2f9abe66700c18c168079c6987a..0000000000000000000000000000000000000000
--- a/cmake/hipify.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: Apache-2.0
-
-# From vLLM: https://github.com/vllm-project/vllm/blob/main/cmake/hipify.py
-
-#
-# A command line tool for running pytorch's hipify preprocessor on CUDA
-# source files.
-#
-# See https://github.com/ROCm/hipify_torch
-# and <torch install dir>/utils/hipify/hipify_python.py
-#
-
-import argparse
-import os
-import shutil
-
-from torch.utils.hipify.hipify_python import hipify
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-
-    # Project directory where all the source + include files live.
-    parser.add_argument(
-        "-p",
-        "--project_dir",
-        help="The project directory.",
-    )
-
-    # Directory where hipified files are written.
-    parser.add_argument(
-        "-o",
-        "--output_dir",
-        help="The output directory.",
-    )
-
-    # Source files to convert.
-    parser.add_argument("sources",
-                        help="Source files to hipify.",
-                        nargs="*",
-                        default=[])
-
-    args = parser.parse_args()
-
-    # Limit include scope to project_dir only
-    includes = [os.path.join(args.project_dir, '*')]
-
-    # Get absolute path for all source files.
-    extra_files = [os.path.abspath(s) for s in args.sources]
-
-    # Copy sources from project directory to output directory.
-    # The directory might already exist to hold object files so we ignore that.
-    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
-
-    hipify_result = hipify(project_directory=args.project_dir,
-                           output_directory=args.output_dir,
-                           header_include_dirs=[],
-                           includes=includes,
-                           extra_files=extra_files,
-                           show_detailed=True,
-                           is_pytorch_extension=True,
-                           hipify_extra_files_only=True)
-
-    hipified_sources = []
-    for source in args.sources:
-        s_abs = os.path.abspath(source)
-        hipified_s_abs = (hipify_result[s_abs].hipified_path if
-                          (s_abs in hipify_result
-                           and hipify_result[s_abs].hipified_path is not None)
-                          else s_abs)
-        hipified_sources.append(hipified_s_abs)
-
-    assert (len(hipified_sources) == len(args.sources))
-
-    # Print hipified source files.
-    print("\n".join(hipified_sources))
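The script prints the hipified paths on stdout, which is how `hipify_sources_target` in `cmake/utils.cmake` (deleted below) consumed it. A standalone invocation would look roughly like this (the paths and the chosen source file are illustrative only):

```cmake
# Sketch only: drive the now-removed hipify.py directly and capture the
# rewritten file names it prints.
execute_process(
  COMMAND "${Python_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/cmake/hipify.py"
          -p "${CMAKE_SOURCE_DIR}" -o "${CMAKE_CURRENT_BINARY_DIR}"
          "sage_attention/fused/fused.cu"
  OUTPUT_VARIABLE _HIPIFIED_SRCS
  OUTPUT_STRIP_TRAILING_WHITESPACE)
# _HIPIFIED_SRCS would name e.g. ${CMAKE_CURRENT_BINARY_DIR}/sage_attention/fused/fused.hip
```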
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
deleted file mode 100644
index 6c87d51ed6cb9b0cc07af011190b7e2cec6d8a58..0000000000000000000000000000000000000000
--- a/cmake/utils.cmake
+++ /dev/null
@@ -1,545 +0,0 @@
-# Vendored from vLLM:
-#
-# https://github.com/vllm-project/vllm/blob/main/cmake/utils.cmake
-#
-# Attempt to find the python package that uses the same python executable as
-# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
-#
-macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
-  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
-  set(Python_EXECUTABLE ${EXECUTABLE})
-  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
-  if (NOT Python_FOUND)
-    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
-  endif()
-  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
-  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
-  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
-    message(FATAL_ERROR
-      "Python version (${_VER}) is not one of the supported versions: "
-      "${_SUPPORTED_VERSIONS_LIST}.")
-  endif()
-  message(STATUS "Found python matching: ${EXECUTABLE}.")
-endmacro()
-
-#
-# Run `EXPR` in python. The standard output of python is stored in `OUT` and
-# has trailing whitespace stripped. If an error is encountered when running
-# python, a fatal message `ERR_MSG` is issued.
-#
-function (run_python OUT EXPR ERR_MSG)
-  execute_process(
-    COMMAND
-    "${Python_EXECUTABLE}" "-c" "${EXPR}"
-    OUTPUT_VARIABLE PYTHON_OUT
-    RESULT_VARIABLE PYTHON_ERROR_CODE
-    ERROR_VARIABLE PYTHON_STDERR
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-  if(NOT PYTHON_ERROR_CODE EQUAL 0)
-    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
-  endif()
-  set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
-endfunction()
-
-# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
-# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
-macro (append_cmake_prefix_path PKG EXPR)
-  run_python(_PREFIX_PATH
-    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
-  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
-endmacro()
-
-#
-# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
-# of CUDA source files. The names of the corresponding "hipified" sources are
-# stored in `OUT_SRCS`.
-#
-function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
-  #
-  # Split into C++ and non-C++ (i.e. CUDA) sources.
-  #
-  set(NODUP_SRCS ${ORIG_SRCS})
-  list(REMOVE_DUPLICATES NODUP_SRCS)
-  set(SRCS ${NODUP_SRCS})
-  set(CXX_SRCS ${NODUP_SRCS})
-  list(FILTER SRCS INCLUDE REGEX "\.cu$")
-  list(FILTER CXX_SRCS EXCLUDE REGEX "\.cu$")
-
-  #
-  # Generate ROCm/HIP source file names from CUDA file names.
-  # Since HIP files are generated code, they will appear in the build area
-  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
-  #
-  set(HIP_SRCS)
-  foreach (SRC ${SRCS})
-    get_source_file_property(include_dirs "${SRC}" INCLUDE_DIRECTORIES)
-    string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
-    string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
-
-    if(include_dirs)
-      # Copy over include directories from the original CUDA file.
-      set_source_files_properties(
-        ${SRC}
-        PROPERTIES INCLUDE_DIRECTORIES "${include_dirs}")
-    endif()
-
-    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
-  endforeach()
-
-  add_custom_target(
-    hipify${NAME}
-    COMMAND "${Python_EXECUTABLE}" ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR} -o ${CMAKE_CURRENT_BINARY_DIR} ${SRCS}
-    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
-    BYPRODUCTS ${HIP_SRCS}
-    COMMENT "Running hipify on ${NAME} extension source files.")
-
-  # Swap out original extension sources with hipified sources.
-  list(APPEND HIP_SRCS ${CXX_SRCS})
-  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
-endfunction()
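Taken together, `run_python` and `append_cmake_prefix_path` are what let the deleted `CMakeLists.txt` resolve Torch without the caller exporting `Torch_DIR`; the composition is simply:

```cmake
# As used at the top of the deleted CMakeLists.txt: ask the active Python
# interpreter where torch's CMake package config lives, then resolve it.
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
find_package(Torch REQUIRED)
```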
-
-#
-# Get additional GPU compiler flags from torch.
-#
-function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
-  if (${GPU_LANG} STREQUAL "CUDA")
-    #
-    # Get common NVCC flags from torch.
-    #
-    run_python(GPU_FLAGS
-      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
-      "Failed to determine torch nvcc compiler flags")
-
-    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
-      list(APPEND GPU_FLAGS "-DENABLE_FP8")
-      list(REMOVE_ITEM GPU_FLAGS
-        "-D__CUDA_NO_HALF_OPERATORS__"
-        "-D__CUDA_NO_HALF_CONVERSIONS__"
-        "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
-        "-D__CUDA_NO_HALF2_OPERATORS__")
-    endif()
-
-  elseif(${GPU_LANG} STREQUAL "HIP")
-    #
-    # Get common HIP/HIPCC flags from torch.
-    #
-    run_python(GPU_FLAGS
-      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
-      "Failed to determine torch hipcc compiler flags")
-
-    list(APPEND GPU_FLAGS
-      "-DUSE_ROCM"
-      "-DENABLE_FP8"
-      "-U__HIP_NO_HALF_CONVERSIONS__"
-      "-U__HIP_NO_HALF_OPERATORS__"
-      "-fno-gpu-rdc")
-
-  endif()
-  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
-endfunction()
-
-# Macro for converting a `gencode` version number to a cmake version number.
-macro(string_to_ver OUT_VER IN_STR)
-  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
-endmacro()
-
-#
-# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
-# `CUDA_ARCH_FLAGS`.
-#
-# Example:
-#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
-#   clear_cuda_arches(CUDA_ARCH_FLAGS)
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
-#   CMAKE_CUDA_FLAGS="-Wall"
-#
-macro(clear_cuda_arches CUDA_ARCH_FLAGS)
-  # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-  string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
-    ${CMAKE_CUDA_FLAGS})
-
-  # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
-  # and passed back via the `CUDA_ARCHITECTURES` property.
-  string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-    ${CMAKE_CUDA_FLAGS})
-endmacro()
-
-#
-# Extract unique CUDA architectures from a list of compute capabilities codes in
-# the form `<major><minor>[<letter>]`, convert them to the form
-# `<major>.<minor>[<letter>]`, dedupe them, and then sort them in ascending
-# order and store them in `OUT_ARCHES`.
-#
-# Example:
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
-#   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
-#   OUT_ARCHES="7.5;...;9.0a"
-function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
-  set(_CUDA_ARCHES)
-  foreach(_ARCH ${CUDA_ARCH_FLAGS})
-    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
-    if (_COMPUTE)
-      set(_COMPUTE ${CMAKE_MATCH_1})
-    endif()
-
-    string_to_ver(_COMPUTE_VER ${_COMPUTE})
-    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
-  endforeach()
-
-  list(REMOVE_DUPLICATES _CUDA_ARCHES)
-  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
-  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
-endfunction()
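A quick worked example of `string_to_ver` and the extraction above: the regex captures all digits except the last, so a dot lands before the final digit and any trailing letter suffix survives.

```cmake
string_to_ver(_VER "75")    # _VER = "7.5"
string_to_ver(_VER "90a")   # _VER = "9.0a"
string_to_ver(_VER "120")   # _VER = "12.0"
```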
-#
-# For a specific file set the `-gencode` flag in compile options conditionally
-# for the CUDA language.
-#
-# Example:
-#   set_gencode_flag_for_srcs(
-#     SRCS "foo.cu"
-#     ARCH "compute_75"
-#     CODE "sm_75")
-# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
-# `foo.cu` (only for the CUDA language).
-#
-macro(set_gencode_flag_for_srcs)
-  set(options)
-  set(oneValueArgs ARCH CODE)
-  set(multiValueArgs SRCS)
-  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
-    "${multiValueArgs}" ${ARGN})
-  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
-  set_property(
-    SOURCE ${arg_SRCS}
-    APPEND PROPERTY
-    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
-  )
-
-  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
-endmacro(set_gencode_flag_for_srcs)
-
-#
-# For a list of source files set the `-gencode` flags in each file's specific
-# compile options (specifically for the CUDA language).
-#
-# arguments are:
-#   SRCS: list of source files
-#   CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
-#   BUILD_PTX_FOR_ARCH: if set, PTX code will also be built for architecture
-#    `BUILD_PTX_FOR_ARCH` when there is a CUDA_ARCH in CUDA_ARCHS that is
-#    equal to or larger than BUILD_PTX_FOR_ARCH.
-#
-macro(set_gencode_flags_for_srcs)
-  set(options)
-  set(oneValueArgs BUILD_PTX_FOR_ARCH)
-  set(multiValueArgs SRCS CUDA_ARCHS)
-  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
-    "${multiValueArgs}" ${ARGN})
-
-  foreach(_ARCH ${arg_CUDA_ARCHS})
-    # handle +PTX suffix: generate both sm and ptx codes if requested
-    string(FIND "${_ARCH}" "+PTX" _HAS_PTX)
-    if(NOT _HAS_PTX EQUAL -1)
-      string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}")
-      string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}")
-      set_gencode_flag_for_srcs(
-        SRCS ${arg_SRCS}
-        ARCH "compute_${_STRIPPED_ARCH}"
-        CODE "sm_${_STRIPPED_ARCH}")
-      set_gencode_flag_for_srcs(
-        SRCS ${arg_SRCS}
-        ARCH "compute_${_STRIPPED_ARCH}"
-        CODE "compute_${_STRIPPED_ARCH}")
-    else()
-      string(REPLACE "." "" _STRIPPED_ARCH "${_ARCH}")
-      set_gencode_flag_for_srcs(
-        SRCS ${arg_SRCS}
-        ARCH "compute_${_STRIPPED_ARCH}"
-        CODE "sm_${_STRIPPED_ARCH}")
-    endif()
-  endforeach()
-
-  if (arg_BUILD_PTX_FOR_ARCH)
-    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
-    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
-    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
-      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
-      set_gencode_flag_for_srcs(
-        SRCS ${arg_SRCS}
-        ARCH "compute_${_PTX_ARCH}"
-        CODE "compute_${_PTX_ARCH}")
-    endif()
-  endif()
-endmacro()
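-
-# Illustrative example (not from the original file; `foo.cu` is hypothetical):
-#   set_gencode_flags_for_srcs(SRCS "foo.cu" CUDA_ARCHS "8.0;9.0+PTX")
-# adds "-gencode arch=compute_80,code=sm_80",
-# "-gencode arch=compute_90,code=sm_90" and
-# "-gencode arch=compute_90,code=compute_90" to the compile options of foo.cu.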
-#
-# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
-# `<major>.<minor>[letter]` compute the "loose intersection" with the
-# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
-# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
-# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
-# architecture in `SRC_CUDA_ARCHS`.
-# The loose intersection is defined as:
-#   { max{ x \in src | x <= y } | y \in tgt, { x \in src | x <= y } != {} }
-# where `<=` is the version comparison operator.
-# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
-# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
-# We have special handling for x.0a: if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
-# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
-# x.0a to the result (and remove x.0 from `TGT_CUDA_ARCHS`).
-# The result is stored in `OUT_CUDA_ARCHS`.
-#
-# Example:
-#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
-#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
-#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
-#   OUT_CUDA_ARCHS="8.0;8.6;9.0a"
-#
-# Example With PTX:
-#   SRC_CUDA_ARCHS="8.0+PTX"
-#   TGT_CUDA_ARCHS="9.0"
-#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
-#   OUT_CUDA_ARCHS="8.0+PTX"
-#
-function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
-  set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
-  set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})
-
-  # handle +PTX suffix: separate base arch for matching, record PTX requests
-  set(_PTX_ARCHS)
-  foreach(_arch ${_SRC_CUDA_ARCHS})
-    if(_arch MATCHES "\\+PTX$")
-      string(REPLACE "+PTX" "" _base "${_arch}")
-      list(APPEND _PTX_ARCHS "${_base}")
-      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      list(APPEND _SRC_CUDA_ARCHS "${_base}")
-    endif()
-  endforeach()
-  list(REMOVE_DUPLICATES _PTX_ARCHS)
-  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
-
-  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in TGT_CUDA_ARCHS then we should
-  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
-  set(_CUDA_ARCHS)
-  foreach(_arch ${_SRC_CUDA_ARCHS})
-    if(_arch MATCHES "\\a$")
-      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      string(REPLACE "a" "" _base "${_arch}")
-      if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
-        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
-        list(APPEND _CUDA_ARCHS "${_arch}")
-      endif()
-    endif()
-  endforeach()
-
-  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
-
-  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
-  # is less or equal to ARCH (but has the same major version since SASS binary
-  # compatibility is only forward compatible within the same major version).
-  foreach(_ARCH ${_TGT_CUDA_ARCHS})
-    set(_TMP_ARCH)
-    # Extract the major version of the target arch
-    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
-    foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS})
-      # Extract the major version of the source arch
-      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
-      # Check version-less-or-equal, and allow PTX arches to match across majors
-      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
-        if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
-          set(_TMP_ARCH "${_SRC_ARCH}")
-        endif()
-      else()
-        # If we hit a version greater than the target, we can break
-        break()
-      endif()
-    endforeach()
-
-    # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
-    if (_TMP_ARCH)
-      list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
-    endif()
-  endforeach()
-
-  list(REMOVE_DUPLICATES _CUDA_ARCHS)
-
-  # reapply +PTX suffix to architectures that requested PTX
-  set(_FINAL_ARCHS)
-  foreach(_arch ${_CUDA_ARCHS})
-    if(_arch IN_LIST _PTX_ARCHS)
-      list(APPEND _FINAL_ARCHS "${_arch}+PTX")
-    else()
-      list(APPEND _FINAL_ARCHS "${_arch}")
-    endif()
-  endforeach()
-  set(_CUDA_ARCHS ${_FINAL_ARCHS})
-
-  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
-endfunction()
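-
-# Additional example (not from the original file) showing the same-major
-# restriction: SASS built for one major version does not run on another, so
-# without a +PTX request nothing matches across majors.
-#   SRC_CUDA_ARCHS="8.0;8.9"
-#   TGT_CUDA_ARCHS="9.0"
-#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
-#   OUT_CUDA_ARCHS=""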
-#
-# For the given `SRC_ROCM_ARCHS` list of architecture names in the form
-# `gfx<number>[<letter>]` compute the intersection with the `TGT_ROCM_ARCHS`
-# list. Unlike the CUDA case, gfx names have no meaningful version ordering,
-# so this is a plain set intersection: an architecture is kept only if it
-# appears in both lists.
-# The result is stored in `OUT_ROCM_ARCHS`.
-#
-# Example:
-#   SRC_ROCM_ARCHS="gfx900;gfx906;gfx908;gfx90a"
-#   TGT_ROCM_ARCHS="gfx906;gfx908;gfx1030"
-#   hip_archs_loose_intersection(OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
-#   OUT_ROCM_ARCHS="gfx906;gfx908"
-#
-function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
-  list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)
-
-  # ROCm architecture names are typically in the format gfxNNN or gfxNNNx,
-  # where N is a digit and x is a letter. A plain string sort does not order
-  # them by version (e.g. gfx1030 sorts before gfx906), but since this function
-  # only checks membership, the sort merely makes the output order
-  # deterministic.
-  list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)
-
-  set(_ROCM_ARCHS)
-
-  # Find the intersection of supported architectures
-  foreach(_SRC_ARCH ${SRC_ROCM_ARCHS})
-    if(_SRC_ARCH IN_LIST TGT_ROCM_ARCHS)
-      list(APPEND _ROCM_ARCHS ${_SRC_ARCH})
-    endif()
-  endforeach()
-
-  list(REMOVE_DUPLICATES _ROCM_ARCHS)
-  set(${OUT_ROCM_ARCHS} ${_ROCM_ARCHS} PARENT_SCOPE)
-endfunction()
-
-#
-# Override the GPU architectures detected by cmake/torch and filter them by
-# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
-# the architectures on a per-file basis.
-#
-# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
-#
-macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
-  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
-  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")
-
-  if (${GPU_LANG} STREQUAL "HIP")
-    #
-    # `GPU_ARCHES` controls the `--offload-arch` flags.
-    #
-    # If the PYTORCH_ROCM_ARCH env variable exists, then we take it as a list;
-    # if not, then we use CMAKE_HIP_ARCHITECTURES, which was generated by
-    # calling "rocm_agent_enumerator" in "enable_language(HIP)"
-    # (in file Modules/CMakeDetermineHIPCompiler.cmake).
-    #
-    if(DEFINED ENV{PYTORCH_ROCM_ARCH})
-      set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
-    else()
-      set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
-    endif()
-    #
-    # Find the intersection of the supported + detected architectures to
-    # set the module architecture flags.
-    #
-    set(${GPU_ARCHES})
-    foreach (_ARCH ${HIP_ARCHITECTURES})
-      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
-        list(APPEND ${GPU_ARCHES} ${_ARCH})
-      endif()
-    endforeach()
-
-    if(NOT ${GPU_ARCHES})
-      message(FATAL_ERROR
-        "None of the detected ROCm architectures (${HIP_ARCHITECTURES}) are"
-        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
-    endif()
-  endif()
-endmacro()
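-
-# Illustrative example (not from the original file): with
-# PYTORCH_ROCM_ARCH="gfx90a;gfx1100" set in the environment,
-#   override_gpu_arches(GPU_ARCHES HIP "gfx906;gfx908;gfx90a")
-# sets GPU_ARCHES="gfx90a"; gfx1100 is dropped because it is not in the
-# supported list passed here.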
-#
-# Define a target named `GPU_MOD_NAME` for a single extension. The
-# arguments are:
-#
-# DESTINATION          - Module destination directory.
-# LANGUAGE             - The GPU language for this module, e.g. CUDA or HIP.
-# SOURCES              - List of source files relative to CMakeLists.txt
-#                        directory.
-#
-# Optional arguments:
-#
-# ARCHITECTURES        - A list of target GPU architectures in cmake
-#                        format.
-#                        Refer to the `CMAKE_CUDA_ARCHITECTURES` and
-#                        `CMAKE_HIP_ARCHITECTURES` documentation for more info.
-#                        ARCHITECTURES will use cmake's defaults if
-#                        not provided.
-# COMPILE_FLAGS        - Extra compiler flags passed to NVCC/hipcc.
-# INCLUDE_DIRECTORIES  - Extra include directories.
-# LIBRARIES            - Extra link libraries.
-# WITH_SOABI           - Generate library with python SOABI suffix name.
-# USE_SABI             - Use the Python stable ABI.
-#
-# Note: optimization level/debug info is set via cmake build type.
-#
-function (define_gpu_extension_target GPU_MOD_NAME)
-  cmake_parse_arguments(PARSE_ARGV 1
-    GPU
-    "WITH_SOABI"
-    "DESTINATION;LANGUAGE;USE_SABI"
-    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
-
-  # Add hipify preprocessing step when building with HIP/ROCm.
-  if (GPU_LANGUAGE STREQUAL "HIP")
-    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
-  endif()
-
-  if (GPU_WITH_SOABI)
-    set(GPU_WITH_SOABI WITH_SOABI)
-  else()
-    set(GPU_WITH_SOABI)
-  endif()
-
-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-  else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-  endif()
-
-  if (GPU_LANGUAGE STREQUAL "HIP")
-    # Make this target dependent on the hipify preprocessor step.
-    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
-  endif()
-
-  if (GPU_ARCHITECTURES)
-    set_target_properties(${GPU_MOD_NAME} PROPERTIES
-      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
-  endif()
-
-  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
-
-  target_compile_options(${GPU_MOD_NAME} PRIVATE
-    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
-
-  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
-    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
-
-  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-    ${GPU_INCLUDE_DIRECTORIES})
-
-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
-
-  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
-  # dependencies that are not necessary and may not be installed.
-  if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart)
-  else()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
-  endif()
-
-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
-endfunction()
diff --git a/nix-build.log b/nix-build.log
index 9198b7d7dd2ddd4bb3c44d5a3aa6be579629643d..94e123ed87c19e81cfc58cea52d3d18fd9089f35 100644
--- a/nix-build.log
+++ b/nix-build.log
@@ -10,18 +10,18 @@ evaluation warning: `rev` argument of `genFlakeOutputs` is deprecated, pass `sel
     path = ./.;
   };
 these 7 derivations will be built:
-  /nix/store/d04lffjyka9nfvrhmr8863813bwkcn0w-sage_attention-torch-ext.drv
-  /nix/store/g8li7ymzmry7mpxm9k43zkmhzrk2nxsz-sage_attention-torch-ext.drv
-  /nix/store/gaqkbs2b2k9x1yh88ax8vb5rnnl81xmy-sage_attention-torch-ext.drv
-  /nix/store/jlfw8d4bqv1cgckbkmjsam39djmgjsl1-sage_attention-torch-ext.drv
-  /nix/store/p79rdm4kvf0jr7vkv277nbi5mmc2lwyb-sage_attention-torch-ext.drv
-  /nix/store/zcfc2w942q3a6lpp77cmz64zdis9i1dz-torch-ext-bundle.drv
-  /nix/store/q2d20wl8cfvw82mp757i59cvq8z9wmpv-build-and-copy.drv
-building '/nix/store/d04lffjyka9nfvrhmr8863813bwkcn0w-sage_attention-torch-ext.drv'...
-building '/nix/store/g8li7ymzmry7mpxm9k43zkmhzrk2nxsz-sage_attention-torch-ext.drv'...
-building '/nix/store/gaqkbs2b2k9x1yh88ax8vb5rnnl81xmy-sage_attention-torch-ext.drv'...
-building '/nix/store/jlfw8d4bqv1cgckbkmjsam39djmgjsl1-sage_attention-torch-ext.drv'...
-building '/nix/store/p79rdm4kvf0jr7vkv277nbi5mmc2lwyb-sage_attention-torch-ext.drv'...
+ /nix/store/66w6nb3dq6bjb0bils6wjgx4jz6bhchj-sage_attention-torch-ext.drv + /nix/store/6i491c8acr1vm7lb8xfzddscbgzqyjwl-sage_attention-torch-ext.drv + /nix/store/hdl8in3dz7ijjx0l9sk7cd78a66z644f-sage_attention-torch-ext.drv + /nix/store/vrayd3xii2nak1sl95dn7cw4irmy21b5-sage_attention-torch-ext.drv + /nix/store/xqrfdlnc06wy3nl55ypzrqk470gldyn4-sage_attention-torch-ext.drv + /nix/store/ll29z68nd2l37pyv7ihc5sw3qv1wffjp-torch-ext-bundle.drv + /nix/store/dyhilzcxbx6rq6fhn6kpnhig1jf5rv4c-build-and-copy.drv +building '/nix/store/66w6nb3dq6bjb0bils6wjgx4jz6bhchj-sage_attention-torch-ext.drv'... +building '/nix/store/6i491c8acr1vm7lb8xfzddscbgzqyjwl-sage_attention-torch-ext.drv'... +building '/nix/store/hdl8in3dz7ijjx0l9sk7cd78a66z644f-sage_attention-torch-ext.drv'... +building '/nix/store/vrayd3xii2nak1sl95dn7cw4irmy21b5-sage_attention-torch-ext.drv'... +building '/nix/store/xqrfdlnc06wy3nl55ypzrqk470gldyn4-sage_attention-torch-ext.drv'... sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh sage_attention-torch-ext> Sourcing setup-cuda-hook sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh @@ -33,61 +33,64 @@ sage_attention-torch-ext> Sourcing setup-cuda-hook sage_attention-torch-ext> Sourcing get-kernel-check-hook.sh sage_attention-torch-ext> Sourcing setup-cuda-hook sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> unpacking source archive /nix/store/6d6az24clsfzyy97rmzcdk0qax7l7c4p-source +sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: patchPhase +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: configurePhase sage_attention-torch-ext> Running phase: unpackPhase -sage_attention-torch-ext> unpacking source archive /nix/store/9ns9nazv5cyxcnpq9fw96ccinmp18kv1-source -sage_attention-torch-ext> unpacking source archive /nix/store/9ns9nazv5cyxcnpq9fw96ccinmp18kv1-source sage_attention-torch-ext> Running phase: unpackPhase -sage_attention-torch-ext> unpacking source archive /nix/store/9ns9nazv5cyxcnpq9fw96ccinmp18kv1-source -sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> fixing cmake files... 
+sage_attention-torch-ext> unpacking source archive /nix/store/6d6az24clsfzyy97rmzcdk0qax7l7c4p-source +sage_attention-torch-ext> unpacking source archive /nix/store/6d6az24clsfzyy97rmzcdk0qax7l7c4p-source sage_attention-torch-ext> source root is source sage_attention-torch-ext> source root is source +sage_attention-torch-ext> Running phase: unpackPhase sage_attention-torch-ext> Running phase: patchPhase sage_attention-torch-ext> Running phase: patchPhase -sage_attention-torch-ext> Running phase: patchPhase -sage_attention-torch-ext> Running phase: unpackPhase +sage_attention-torch-ext> unpacking source archive /nix/store/6d6az24clsfzyy97rmzcdk0qax7l7c4p-source +sage_attention-torch-ext> source root is source sage_attention-torch-ext> Running phase: unpackPhase -sage_attention-torch-ext> unpacking source archive /nix/store/9ns9nazv5cyxcnpq9fw96ccinmp18kv1-source -sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase -sage_attention-torch-ext> unpacking source archive /nix/store/9ns9nazv5cyxcnpq9fw96ccinmp18kv1-source -sage_attention-torch-ext> source root is source -sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> unpacking source archive /nix/store/6d6az24clsfzyy97rmzcdk0qax7l7c4p-source +sage_attention-torch-ext> Running phase: patchPhase sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include -DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-s
tatic\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages sage_attention-torch-ext> Running phase: configurePhase sage_attention-torch-ext> source root is source -sage_attention-torch-ext> Running phase: patchPhase -sage_attention-torch-ext> Running phase: patchPhase -sage_attention-torch-ext> Executing setupCUDAToolkitCompilers -sage_attention-torch-ext> fixing cmake files... sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase +sage_attention-torch-ext> Running phase: patchPhase sage_attention-torch-ext> Executing setupCUDAToolkitCompilers sage_attention-torch-ext> fixing cmake files... sage_attention-torch-ext> Executing setupCUDAToolkitCompilers -sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase sage_attention-torch-ext> fixing cmake files... sage_attention-torch-ext> Running phase: configurePhase -sage_attention-torch-ext> Running phase: configurePhase +sage_attention-torch-ext> Running phase: updateAutotoolsGnuConfigScriptsPhase sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> Running phase: configurePhase sage_attention-torch-ext> fixing cmake files... 
sage_attention-torch-ext> Executing setupCUDAToolkitCompilers +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include 
-DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0y
rrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages sage_attention-torch-ext> fixing cmake files... -sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev/include\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev/include\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev/include\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev/include\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev/include\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev/include\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev/include\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev/include\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev/include\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev/include\;/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev/include\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev/include -DCUDAToolkit_ROOT=/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86\;/nix/store/q2al0drhrl0yxk97xbsjl8d0h25kmsq9-libcurand-10.3.10.19-lib\;/nix/store/ax1ssn45048qbmyy19basgv6q64y5jy0-cuda_cupti-12.9.79\;/nix/store/m09542l6q83flp3asv2r4j3wcbjqksvg-libcufile-1.14.1.1-static\;/nix/store/b3wbcra9cziq8bwf3yhmj2nn1mf5bqy2-cuda_cudart-12.9.79-lib\;/nix/store/j5kp5fg9mn6hhslk18wbmskc7v96l353-cuda_cupti-12.9.79-static\;/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev\;/nix/store/3s79bz4ldkhlks6jf9a2jd4r34y6018b-libcurand-10.3.10.19\;/nix/store/v48xzq66pzmygxqkws17n9nvpa7lad9d-cuda_nvml_dev-12.9.79\;/nix/store/6via2axi1n31n685jii6dwaiqca8b2rc-cuda_nvcc-12.9.86-static\;/nix/store/v0hx9fqdlmz9kvjd9sqr2zc141ny10yn-cuda_profiler_api-12.9.79\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev\;/nix/store/8cig7k11qv5g8x0j8n2mbdfzwrnf7cg2-cuda_cudart-12.9.79-stubs\;/nix/store/xg8pj5m74n2h3v8kgxbvmbpcl90rzmlx-cudnn-9.11.0.98-static\;/nix/store/v4b7mkhyq1akczzkcyynj7y9c61l9dc7-cuda_cudart-12.9.79-static\;/nix/store/hw2swakbrvi4innrymcw8i2m98p73br0-cuda_cupti-12.9.79-sample\;/nix/store/s1i2kadnni2m4skpzzqzfzc3bpmrxi7p-libcusparse-12.5.10.65-lib\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev\;/nix/store/0a83zdhkh2i9d97r4zqdn8fi8vn4wfk3-libcublas-12.9.1.4-static\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev\;/nix/store/jnhjz87sm9nbnb72n54jj2l99szrzpg2-libcusparse-12.5.10.65\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev\;/nix/store/d1m6c5i6y6ncjygpdmv1b4pmd91hvjr2-cuda_cupti-12.9.79-lib\;/nix/store/49p6af3v11dcxvq9andr6l8csa2sr4j4-cuda_nvrtc-12.9.86-static\;/nix/store/bfygrgghga26l7br5d5j3h6hd1s21rkn-cudnn-9.11.0.98\;/nix/store/a6an9chi5dvjsybrfrxql0bn76xswzpa-libcufft-11.4.1.4\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev\;/nix/store/7zy91byrxpnyzhjlwham2gqyir2x6f54-libcusolver-11.7.5.82-lib\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev\;/nix/store/cx0hyla7fkqqc5hh1gn4hkarjyjvbjhf-libcusparse-12.5.10.65-static\;/nix/store/3yi8kx62nklnyn77zn4z23hi03l9c7ff-libcusolver-11.7.5.82-static\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev\;/nix/store/86nq76ks8vlgjdsnh1hkskyfw7mm3plc-cuda_cccl-12.9.27\;/nix/store/01ywykdxfkv
p64318anifgx7zaavz9ql-cuda_nvml_dev-12.9.79-lib\;/nix/store/qv2m9i0nby2p03xx37mkkm84dlqb9s84-cuda_cudart-12.9.79\;/nix/store/a09saq5rl5jxbgv9gqllx0080ypjk00x-libcufile-1.14.1.1-lib\;/nix/store/0l18n4dhavr0p4rk0nyqqjr8paacak13-libcufile-1.14.1.1\;/nix/store/r8ly0w88qv4gw3lhd784ha0ag221c23s-cuda_nvrtc-12.9.86-lib\;/nix/store/rngn6cls1blhilrw78xb3pjgwghibhzk-libcurand-10.3.10.19-static\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev\;/nix/store/ikw7sqic4kknjkp50dr54khgs06q1hbv-cuda_nvml_dev-12.9.79-static\;/nix/store/bzdnjn29xj8a73wg16qrz0sswi9svp0x-libcublas-12.9.1.4\;/nix/store/62hqkwasnanq5i1j63z4clc0s4c61k1r-libcufft-11.4.1.4-static\;/nix/store/5sjldyn2vmm4ky24v1f9ggs0hps496q3-libcusolver-11.7.5.82\;/nix/store/9c924z3749bfm078bwq4ad12kjz46pjf-libcufft-11.4.1.4-lib\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev\;/nix/store/c1kdvq8xqqkwzzazl99w20h4x9z0f9pc-libcublas-12.9.1.4-lib\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev\;/nix/store/h6kzw3gvlv4sa0apb4fflpjlirhj72ga-cudnn-9.11.0.98-lib\;/nix/store/f5gvpjis5y727lw6vzr2h1zkb3hm08k2-cuda_nvrtc-12.9.86 -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages -sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python 
-DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include -DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix
/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages -sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ 
-DCMAKE_INSTALL_PREFIX=/nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include -DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66y
zsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-static\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages -sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib 
-DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include -DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/5f6dvklv5d0mvygrrf0vzp0smcn7kk01-cuda_nvtx-12.8.90\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/wa9pr3485k3mw8jh
v7i9kfzjrqmdl5bb-cuda_nvtx-12.8.90-lib\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages -sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/sbin 
-DCMAKE_INSTALL_BINDIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib -DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include 
-DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev\;/nix/store/bcvj4g3f3n6cpb6czcb5k8zdmyd94fwi-cuda_nvtx-12.6.77-lib\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/f87x0n0gi2d7rxh1ja92za2ixcw60q2p-cuda_nvtx-12.6.77\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-static\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-
static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 
-DCUDAToolkit_INCLUDE_DIR=/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev/include\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev/include\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev/include\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev/include\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev/include\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev/include\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev/include\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev/include\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.83-dev/include\;/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev/include\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev/include\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev/include\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev/include\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include -DCUDAToolkit_ROOT=/nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93\;/nix/store/w96jlfiy431jnsww1x3ak3chhssa3i2s-libcusparse-12.5.8.93\;/nix/store/6zj6v3b9v8xdjs94iq1228slqwr757ij-libcublas-12.8.4.1\;/nix/store/q85pndpvaqdznfijmkn0mlfp8y3v08dl-cuda_cccl-12.8.90\;/nix/store/2dc9bgppqvyd6bd5m4j9zphiyhhd39lv-libcurand-10.3.9.90-dev\;/nix/store/cwy7010iwla9b2v1fx82sp66v12r913x-libcublas-12.8.4.1-lib\;/nix/store/x6d389mfn7v413ia2had715g7rdgghgm-cuda_nvrtc-12.8.93-dev\;/nix/store/22n25ss46s0hgspdp26qk025w9m393cd-libcublas-12.8.4.1-static\;/nix/store/sc5wnfvmk0j73xdppxj25kgk8s98lscs-cuda_nvrtc-12.8.93-lib\;/nix/store/54wqrrh6qbrwmv2wkz6b216ljrqbhcji-cudnn-9.11.0.98\;/nix/store/4sz65s9xk80q9jij0i4zbp9xd1pmr3ja-libcusparse-12.5.8.93-dev\;/nix/store/11bshw90q985bpd9ds649qmgg0x54q7x-cudnn-9.11.0.98-dev\;/nix/store/8dwjdyr7y3dkqlgswpn9swz884lx62gf-cuda_cccl-12.8.90-dev\;/nix/store/1v8m3gdw08hnbs7qa4jbkflm9lg1r5q6-libcurand-10.3.9.90\;/nix/store/jc58pv1cxhvpblrnzgaai60x04q6m0bp-cuda_nvml_dev-12.8.90-lib\;/nix/store/khwhv5d4kmzjpsm785iz3sva6i9sj9r5-libcufile-1.13.1.3-static\;/nix/store/xv6c2jcc3adyqks2xl28p4r0q1g4bc92-cuda_cupti-12.8.90\;/nix/store/a2h2yfjfx0si8smnqmghw7ccj0qbnv81-cuda_cupti-12.8.90-lib\;/nix/store/4cq7zkla3djm6g5gkpzzx4gfikda2k7z-cuda_profiler_api-12.8.90-dev\;/nix/store/5f6dvklv5d0mvygrrf0vzp0smcn7kk01-cuda_nvtx-12.8.90\;/nix/store/xccbzbpcn8r506zdvhvbkqkilhlrh3c5-cuda_cudart-12.8.90-lib\;/nix/store/acbir62i1d7kvka4plmxsq8442z7r1l2-cuda_cudart-12.8.90-stubs\;/nix/store/ckkcbggf4x93zg3xn9xr00jgxs2x5p21-cuda_nvml_dev-12.8.90-static\;/nix/store/ml3bkm8bz1lnjmfd8lyxbjqpi1llasr2-libcusolver-11.7.3.90\;/nix/store/9zlrjnq7lisarny3llszk131vy816x2w-libcufile-1.13.1.3\;/nix/store/90nghg4zsrw6gki8y8hw4id3p31bc8rk-libcusolver-11.7.3.90-dev\;/nix/store/vg32acb8vlqyhkhabbgvmralfw0kwhi3-cuda_cudart-12.8.90-dev\;/nix/store/y27d2s3rcw8d17wcw23glhlj5rhs8d6y-cuda_cudart-12.8.90\;/nix/store/wa9pr3485k3mw8jhv7i9kfzjrqmdl5bb-cuda_nvtx-12.8.90-lib\;/nix/store/n96pib9yj31n031dmrrx43m61js1r5rn-cuda_nvcc-12.8.93-static\;/nix/store/pabakly3280dnghh3i89wklfm61raf7z-cuda_cupti-12.8.90-sample\;/nix/store/l0jiwp1f0dhigd41qqf408c5qyabz2vd-cudnn-9.11.0.98-static\;/nix/store/95lzbxp68m127n6hyllbr3dh2mlj7y8m-libcufft-11.3.3.83\;/nix/store/lxsd5l6hnqcfgqc1nsn8mmmpx385m3k8-libcusparse-12.5.8.93-lib\;/nix/store/vqg4r8izl1fy2smmw4dwv4x1adkj0rfb-libcufft-11.3.3.
83-dev\;/nix/store/4b9rdinnksj1856siw3qmwi9f10480ii-cuda_nvrtc-12.8.93-static\;/nix/store/qh7zggir1ikzh3kvkhi2mqzpyisl4153-libcurand-10.3.9.90-static\;/nix/store/n25l4gcpw8cry4rg2a4c9jw3f53i65zd-libcusolver-11.7.3.90-lib\;/nix/store/xh73kc8spwfvd6w6wc63pyq3zm6qlrja-cuda_nvml_dev-12.8.90\;/nix/store/bgiqy1z8588hgcdzyh9brhc015w3nii0-libcurand-10.3.9.90-lib\;/nix/store/5pvax5f2dg278j43b4llkdxim9y0bjaf-cuda_nvml_dev-12.8.90-dev\;/nix/store/7lf23alvk7yh64flf2mj6smx66sqyz9d-libcufile-1.13.1.3-lib\;/nix/store/klis291y7cza60yzgkxzbid80bnyshmr-cuda_nvtx-12.8.90-dev\;/nix/store/lfqj2ni7r0ir3n840b8r1lh63mnqr0ar-libcusparse-12.5.8.93-static\;/nix/store/qmw5pq21avnfvsk657k0zr4nsgwxa4jm-cuda_cudart-12.8.90-static\;/nix/store/826d39r2b4gwafqsyhvzq2bmqv8ygzrd-cuda_profiler_api-12.8.90\;/nix/store/g52lygjflrsyr6wahpf0rvs3fpna3wq9-cudnn-9.11.0.98-lib\;/nix/store/gxw5c9f7q2f1pmy0g1zyblb8p2p891a4-libcufft-11.3.3.83-lib\;/nix/store/pbsi8w1in7q44z83ndqsaxyzfrr2frgh-cuda_nvrtc-12.8.93\;/nix/store/mps4gsnyk6s676zadvcykjxn08yghk5a-libcufile-1.13.1.3-dev\;/nix/store/mvfnbb1m20fkv2n0j69ky9s9afn8p7h1-libcufft-11.3.3.83-static\;/nix/store/8byjxgnvhcyav2283wcxp752d8280c36-libcusolver-11.7.3.90-static\;/nix/store/gz9xyhflw755r8fcxkc816fp54sj0hl4-cuda_cupti-12.8.90-dev\;/nix/store/jyd8jp3q1d408n8842rb8g6ziviwm7q1-cuda_cupti-12.8.90-static\;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/sbin -DCMAKE_INSTALL_BINDIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/strip -DCMAKE_RANLIB=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ranlib 
-DCMAKE_AR=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev/include\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev/include\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev/include\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev/include\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev/include\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev/include\;/nix/store/sskxmb670akk0avrahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev/include\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev/include\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev/include\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev/include\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev/include\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev/include\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev/include -DCUDAToolkit_ROOT=/nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85\;/nix/store/1qgrl2sgdj5m7llm2vs9690gd9998psq-cudnn-9.11.0.98\;/nix/store/d2z15dzsgfm4r2yyl16n3wc0sw8z6fia-cuda_cupti-12.6.80-lib\;/nix/store/86ngm5djfbl6a0i43j282680chqz1vr8-libcusparse-12.5.4.2-lib\;/nix/store/bmph9rbyqnyjs02zriwq78kg16h12wi6-libcublas-12.6.4.1-lib\;/nix/store/wny8xmyma0ziffas96ansxgmjfqpw393-cuda_nvrtc-12.6.85-lib\;/nix/store/j40ndiqjiqbiqrbfmgmkzz6w8757cgvk-cuda_nvml_dev-12.6.77-lib\;/nix/store/3ii532blh586xxavim32i21kr84wlcdc-cuda_profiler_api-12.6.77\;/nix/store/j32l8jnzckhdy2lzxgyd59y7p39y6b1d-libcusolver-11.7.1.2-static\;/nix/store/5iv2zpbf4k00ch4c5zfi5b8dlj90y3d3-cuda_cccl-12.6.77\;/nix/store/a8yi28jqv5185bbv10jpjja3x98i86hm-cuda_cudart-12.6.77-stubs\;/nix/store/ya85qn68jv6mlq6gh6phh5hwk3dkynag-cuda_cudart-12.6.77-static\;/nix/store/m65ribrsnk3gbabcx9ah6phgiil19j01-libcufile-1.11.1.6\;/nix/store/5f6h6xs5c74iqcjda3y73i290mfwfs9x-cuda_nvml_dev-12.6.77-dev\;/nix/store/r26q9f2lhsvimxha44g1xcck3adrdqwg-cuda_nvrtc-12.6.85-dev\;/nix/store/nj1a061pvzpq9dr65yj3jpjqcx6pr4fq-cuda_nvtx-12.6.77-dev\;/nix/store/bcvj4g3f3n6cpb6czcb5k8zdmyd94fwi-cuda_nvtx-12.6.77-lib\;/nix/store/9ik1skjb698l6vkx4m4wvx2nrr4sx0na-libcufft-11.3.0.4-dev\;/nix/store/k5rbpivsz3ilsxg91pgigp6la8ln3cv9-cuda_cupti-12.6.80\;/nix/store/vl1dficb0blxzqg6xqzfi5p119jvl2vi-libcusolver-11.7.1.2-dev\;/nix/store/f87x0n0gi2d7rxh1ja92za2ixcw60q2p-cuda_nvtx-12.6.77\;/nix/store/n7x9kkzi2jdfj6f6yjwywfhyfmn957zp-cuda_cupti-12.6.80-dev\;/nix/store/m0fwdgh4nmrjd0q9v4m2ly63qbcq2hi2-cuda_cudart-12.6.77\;/nix/store/qfaxx4b8l1alrrl0gbyb23k3j850c0v5-libcurand-10.3.7.77-static\;/nix/store/w1npzy8mfl28w7cib5idkg6nvlbzhpzq-libcufile-1.11.1.6-lib\;/nix/store/8abbm2gd77dv0l3acw0s18wln36aa0l5-cuda_cudart-12.6.77-lib\;/nix/store/ykb9bv2lqkf1wzy73q96cb04pybx9xa2-cuda_nvcc-12.6.85-static\;/nix/store/nw9ws2qvhgdb33qgfx4iqj517814qq8y-libcufft-11.3.0.4\;/nix/store/sskxmb670akk0avr
ahrl4r6hp7925zh8-cuda_cudart-12.6.77-dev\;/nix/store/mfc3ah6lwfd8dfbs77b0z9i75c471b0n-libcufft-11.3.0.4-static\;/nix/store/zk3cg1ws6cskrzyhdr5d68f8zrkfk77d-cuda_nvrtc-12.6.85-static\;/nix/store/pcrirrvn2ya5d3r1y18s2zj4pm2jladw-libcusolver-11.7.1.2\;/nix/store/qdn67x8jrwr418air16kwicya4d747pq-libcufft-11.3.0.4-lib\;/nix/store/dg8hyrzy7sh3wdhcr4ywsz05cvl6vfyc-libcusparse-12.5.4.2\;/nix/store/8a9vz66yzsar01lpgipmzq8skyk3ymkp-cuda_cccl-12.6.77-dev\;/nix/store/wmcrrdxd3db58nklyp7yf90kknfdx6b5-libcurand-10.3.7.77-lib\;/nix/store/xd2xrldv3lbg1bk93nr0yccy6j0vhh2k-cudnn-9.11.0.98-dev\;/nix/store/0w4g3rxgkw9r0lv737rslqdk7wldmi0n-libcurand-10.3.7.77-dev\;/nix/store/jr1397g6pshvil5n4lnvp7dm24dm71h8-libcublas-12.6.4.1-static\;/nix/store/wq0wv7df58h6bgggnz964sk8m1hbkxxp-cuda_cupti-12.6.80-sample\;/nix/store/m0s4p867fk6wk8ba7ym9yff4mayqjhlw-libcusparse-12.5.4.2-dev\;/nix/store/blh9iyvjkmwd871mfjvfhnp7njwgnc6b-cuda_profiler_api-12.6.77-dev\;/nix/store/ngwsphsxf906z7cgwg32d1w83p809ywl-cudnn-9.11.0.98-static\;/nix/store/07zlxn68jyf4s263xafnjid55grmi7a2-cuda_nvrtc-12.6.85\;/nix/store/zyh7hqq402zc7dhafhbh9vycyzcfq256-libcurand-10.3.7.77\;/nix/store/x7mww4k0zzzb7bnffv0b22jqbyf1mg3v-cuda_cupti-12.6.80-static\;/nix/store/xvlapjc6spss1kvbjlq97m6pk19hfrxz-cuda_nvml_dev-12.6.77\;/nix/store/7j4zf0r8flh7l4x5pm1mgqb2vcabmcdj-libcusolver-11.7.1.2-lib\;/nix/store/gs8gw8bgjccrjxlyzhxa7h85gkxgqwhn-libcufile-1.11.1.6-static\;/nix/store/p9dnsv7mv8mqm9aisrckq8lm3zs3l7dk-cudnn-9.11.0.98-lib\;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev\;/nix/store/dpska4iiya4xa5zzzmqzx3ljws73bnds-cuda_nvml_dev-12.6.77-static\;/nix/store/gzykkbwmch7pxgfzf86fg0b928lz6b36-libcusparse-12.5.4.2-static\;/nix/store/nqn7lvw8gbwbymdhz4nak9wf9b5bbah9-libcublas-12.6.4.1\;/nix/store/4pwy3k2s52ppzbs3k6d58kda8jhmiim4-libcufile-1.11.1.6-dev -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> cmake flags: -GNinja -DCMAKE_FIND_USE_SYSTEM_PACKAGE_REGISTRY=OFF -DCMAKE_FIND_USE_PACKAGE_REGISTRY=OFF -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_INSTALL_LOCALEDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/share/locale -DCMAKE_INSTALL_LIBEXECDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/libexec -DCMAKE_INSTALL_LIBDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/lib -DCMAKE_INSTALL_DOCDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/share/doc/sage_attention -DCMAKE_INSTALL_INFODIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/share/info -DCMAKE_INSTALL_MANDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/share/man -DCMAKE_INSTALL_INCLUDEDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/include -DCMAKE_INSTALL_SBINDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/sbin 
-DCMAKE_INSTALL_BINDIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/bin -DCMAKE_INSTALL_NAME_DIR=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/lib -DCMAKE_POLICY_DEFAULT_CMP0025=NEW -DCMAKE_FIND_FRAMEWORK=LAST -DCMAKE_STRIP=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/strip -DCMAKE_RANLIB=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ranlib -DCMAKE_AR=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/ar -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_INSTALL_PREFIX=/nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext -DPython_EXECUTABLE:STRING=/nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python -DCMAKE_CUDA_HOST_COMPILER:STRING=/nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ -DNVCC_THREADS=3 -DCUDAToolkit_INCLUDE_DIR=/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev/include\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev/include\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev/include\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev/include\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev/include\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev/include\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev/include\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev/include\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev/include\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev/include\;/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev/include\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev/include 
-DCUDAToolkit_ROOT=/nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86\;/nix/store/q2al0drhrl0yxk97xbsjl8d0h25kmsq9-libcurand-10.3.10.19-lib\;/nix/store/ax1ssn45048qbmyy19basgv6q64y5jy0-cuda_cupti-12.9.79\;/nix/store/m09542l6q83flp3asv2r4j3wcbjqksvg-libcufile-1.14.1.1-static\;/nix/store/b3wbcra9cziq8bwf3yhmj2nn1mf5bqy2-cuda_cudart-12.9.79-lib\;/nix/store/j5kp5fg9mn6hhslk18wbmskc7v96l353-cuda_cupti-12.9.79-static\;/nix/store/kky5wd8qwb0hx3jb3j9qc1bkwznw3z83-libcusparse-12.5.10.65-dev\;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev\;/nix/store/zsmc0yjbjrfbamm9ycrlz5yzi5hrbag1-libcurand-10.3.10.19-dev\;/nix/store/3s79bz4ldkhlks6jf9a2jd4r34y6018b-libcurand-10.3.10.19\;/nix/store/v48xzq66pzmygxqkws17n9nvpa7lad9d-cuda_nvml_dev-12.9.79\;/nix/store/6via2axi1n31n685jii6dwaiqca8b2rc-cuda_nvcc-12.9.86-static\;/nix/store/v0hx9fqdlmz9kvjd9sqr2zc141ny10yn-cuda_profiler_api-12.9.79\;/nix/store/ip4lb9ximc445dbdkdvia4whx83g00g3-libcusolver-11.7.5.82-dev\;/nix/store/8cig7k11qv5g8x0j8n2mbdfzwrnf7cg2-cuda_cudart-12.9.79-stubs\;/nix/store/xg8pj5m74n2h3v8kgxbvmbpcl90rzmlx-cudnn-9.11.0.98-static\;/nix/store/v4b7mkhyq1akczzkcyynj7y9c61l9dc7-cuda_cudart-12.9.79-static\;/nix/store/hw2swakbrvi4innrymcw8i2m98p73br0-cuda_cupti-12.9.79-sample\;/nix/store/s1i2kadnni2m4skpzzqzfzc3bpmrxi7p-libcusparse-12.5.10.65-lib\;/nix/store/81xppf0rrqfasvg7wy4z891ab473nb9v-libcufile-1.14.1.1-dev\;/nix/store/0a83zdhkh2i9d97r4zqdn8fi8vn4wfk3-libcublas-12.9.1.4-static\;/nix/store/nkvyh0qxbfj2wbm3r800xd6x1fhs1s4x-cuda_cccl-12.9.27-dev\;/nix/store/jnhjz87sm9nbnb72n54jj2l99szrzpg2-libcusparse-12.5.10.65\;/nix/store/ik96pdimvw3bjj8wdr6laxycnn5lpwby-libcufft-11.4.1.4-dev\;/nix/store/d1m6c5i6y6ncjygpdmv1b4pmd91hvjr2-cuda_cupti-12.9.79-lib\;/nix/store/49p6af3v11dcxvq9andr6l8csa2sr4j4-cuda_nvrtc-12.9.86-static\;/nix/store/bfygrgghga26l7br5d5j3h6hd1s21rkn-cudnn-9.11.0.98\;/nix/store/a6an9chi5dvjsybrfrxql0bn76xswzpa-libcufft-11.4.1.4\;/nix/store/f9r19xpj8qayy3b74gx3gbjrq0z1aq3b-cuda_nvml_dev-12.9.79-dev\;/nix/store/7zy91byrxpnyzhjlwham2gqyir2x6f54-libcusolver-11.7.5.82-lib\;/nix/store/0kycn0pb0x46h16afxw2bjrm1gjq1355-cuda_profiler_api-12.9.79-dev\;/nix/store/cx0hyla7fkqqc5hh1gn4hkarjyjvbjhf-libcusparse-12.5.10.65-static\;/nix/store/3yi8kx62nklnyn77zn4z23hi03l9c7ff-libcusolver-11.7.5.82-static\;/nix/store/z2xfln4d3r92hjjihlq5w6hvh5qhpcb4-cudnn-9.11.0.98-dev\;/nix/store/86nq76ks8vlgjdsnh1hkskyfw7mm3plc-cuda_cccl-12.9.27\;/nix/store/01ywykdxfkvp64318anifgx7zaavz9ql-cuda_nvml_dev-12.9.79-lib\;/nix/store/qv2m9i0nby2p03xx37mkkm84dlqb9s84-cuda_cudart-12.9.79\;/nix/store/a09saq5rl5jxbgv9gqllx0080ypjk00x-libcufile-1.14.1.1-lib\;/nix/store/0l18n4dhavr0p4rk0nyqqjr8paacak13-libcufile-1.14.1.1\;/nix/store/r8ly0w88qv4gw3lhd784ha0ag221c23s-cuda_nvrtc-12.9.86-lib\;/nix/store/rngn6cls1blhilrw78xb3pjgwghibhzk-libcurand-10.3.10.19-static\;/nix/store/x4w41r4jyapqwdghvi6xrpd0mnim4x08-cuda_cudart-12.9.79-dev\;/nix/store/ikw7sqic4kknjkp50dr54khgs06q1hbv-cuda_nvml_dev-12.9.79-static\;/nix/store/bzdnjn29xj8a73wg16qrz0sswi9svp0x-libcublas-12.9.1.4\;/nix/store/62hqkwasnanq5i1j63z4clc0s4c61k1r-libcufft-11.4.1.4-static\;/nix/store/5sjldyn2vmm4ky24v1f9ggs0hps496q3-libcusolver-11.7.5.82\;/nix/store/9c924z3749bfm078bwq4ad12kjz46pjf-libcufft-11.4.1.4-lib\;/nix/store/f21f8hghg4fiwa2ix29h1zy854p7q4v6-cuda_nvrtc-12.9.86-dev\;/nix/store/c1kdvq8xqqkwzzazl99w20h4x9z0f9pc-libcublas-12.9.1.4-lib\;/nix/store/ns0brisbkgrjyfi16rlyjjgcym4jk6qv-cuda_cupti-12.9.79-dev\;/nix/store/h6kzw3gvlv4sa0apb4fflpjlirhj72ga-cudnn-9.11.0.98-lib\;/nix/store/f5gvpjis5y727lw6vzr2h
1zkb3hm08k2-cuda_nvrtc-12.9.86 -DPROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DProtobuf_PROTOC_EXECUTABLE=/nix/store/g82m0ia59azh4a1bcrk0r15qck6hp8da-protobuf-31.1/bin/protoc -DPYBIND11_PYTHONLIBS_OVERWRITE=OFF -DPYTHON_EXECUTABLE=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/bin/python3.13 -DPYTHON_INCLUDE_DIR=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/include/python3.13 -DPYTHON_SITE_PACKAGES=/nix/store/iyff8129iampdw13nlfqalzhxy8y1hi9-python3-3.13.6/lib/python3.13/site-packages +sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 +sage_attention-torch-ext> -- Detecting CXX compiler ABI info sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 -sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 sage_attention-torch-ext> -- Detecting CXX compiler ABI info +sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 sage_attention-torch-ext> -- Detecting CXX compiler ABI info sage_attention-torch-ext> -- Detecting CXX compiler ABI info sage_attention-torch-ext> -- The CXX compiler identification is GNU 14.3.0 -sage_attention-torch-ext> -- The CXX compiler identification is GNU 13.4.0 -sage_attention-torch-ext> -- Detecting CXX compiler ABI info sage_attention-torch-ext> -- Detecting CXX compiler ABI info sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done +sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped +sage_attention-torch-ext> -- Detecting CXX compile features +sage_attention-torch-ext> -- Detecting CXX compile features - done sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done @@ -95,12 +98,10 @@ sage_attention-torch-ext> -- Detecting CXX compiler ABI info - done sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped sage_attention-torch-ext> -- Detecting CXX compile features sage_attention-torch-ext> -- Detecting CXX compile features - done -sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/d8likaw8xxdmh2qmmasbm88h74q6a2gr-gcc-wrapper-14.3.0/bin/g++ - skipped sage_attention-torch-ext> -- Detecting CXX compile features -sage_attention-torch-ext> -- Detecting CXX compile features -sage_attention-torch-ext> -- Detecting CXX compile features - done sage_attention-torch-ext> -- Detecting CXX compile features - done +sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps sage_attention-torch-ext> -- Check for working CXX compiler: /nix/store/rgfv9lch0b6ksjzlzsx0mljsb0ypqr8x-gcc-wrapper-13.4.0/bin/g++ - skipped sage_attention-torch-ext> -- Detecting CXX compile features sage_attention-torch-ext> -- Detecting CXX compile features - done @@ -111,12 +112,11 @@ sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_d sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps sage_attention-torch-ext> -- 
FetchContent base directory: /build/source/build/_deps
sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps
-sage_attention-torch-ext> -- FetchContent base directory: /build/source/build/_deps
sage_attention-torch-ext> -- Found Python: /nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
-sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
+sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
sage_attention-torch-ext> -- Found Python: /nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
-sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
+sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Development Development.SABIModule Interpreter Development.Module Development.Embed
sage_attention-torch-ext> -- Found CUDA: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 (found version "12.8")
sage_attention-torch-ext> -- Found CUDA: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85 (found version "12.6")
sage_attention-torch-ext> -- Found CUDA: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86 (found version "12.9")
@@ -124,48 +124,44 @@ sage_attention-torch-ext> -- Found CUDA: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7
sage_attention-torch-ext> -- Found CUDA: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93 (found version "12.8")
sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.6.85 with host compiler GNU 13.4.0
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info
-sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.6.85 with host compiler GNU 13.4.0
-sage_attention-torch-ext> -- Detecting CUDA compiler ABI info
sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.8.93 with host compiler GNU 14.3.0
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info
+sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.6.85 with host compiler GNU 13.4.0
+sage_attention-torch-ext> -- Detecting CUDA compiler ABI info
sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.9.86 with host compiler GNU 14.3.0
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info
sage_attention-torch-ext> -- The CUDA compiler identification is NVIDIA 12.8.93 with host compiler GNU 14.3.0
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done
-sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done
sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped
sage_attention-torch-ext> -- Detecting CUDA compile features
sage_attention-torch-ext> -- Detecting CUDA compile features - done
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done
-sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped
-sage_attention-torch-ext> -- Detecting CUDA compile features
-sage_attention-torch-ext> -- Detecting CUDA compile features - done
-sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85")
-sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done
sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85")
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
+sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done
+sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc - skipped
sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc - skipped
sage_attention-torch-ext> -- Detecting CUDA compile features
sage_attention-torch-ext> -- Detecting CUDA compile features - done
-sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc - skipped
sage_attention-torch-ext> -- Detecting CUDA compile features
sage_attention-torch-ext> -- Detecting CUDA compile features - done
sage_attention-torch-ext> -- Detecting CUDA compiler ABI info - done
-sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93")
+sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/include;/nix/store/fy71fffqbwg3xgvygn66kd4igj65gblv-libcublas-12.6.4.1-dev/include (found version "12.6.85")
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-sage_attention-torch-ext> -- Looking for pthread_create in pthreads
-sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include (found version "12.9.86")
+sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93")
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
+sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc - skipped
+sage_attention-torch-ext> -- Detecting CUDA compile features
+sage_attention-torch-ext> -- Detecting CUDA compile features - done
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
sage_attention-torch-ext> -- Looking for pthread_create in pthreads
sage_attention-torch-ext> -- Check for working CUDA compiler: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc - skipped
sage_attention-torch-ext> -- Detecting CUDA compile features
sage_attention-torch-ext> -- Detecting CUDA compile features - done
-sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found
-sage_attention-torch-ext> -- Looking for pthread_create in pthread
+sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/include;/nix/store/dd8wl3nnsigw2gj5bwaiswla97jpw1jz-libcublas-12.9.1.4-dev/include (found version "12.9.86")
+sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
sage_attention-torch-ext> -- Found CUDAToolkit: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/include;/nix/store/qa4d2v0lsm6giyr4b4421qsdygz0yrrh-libcublas-12.8.4.1-dev/include (found version "12.8.93")
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD
sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found
@@ -174,12 +170,10 @@ sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
sage_attention-torch-ext> -- Looking for pthread_create in pthreads
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
sage_attention-torch-ext> -- Looking for pthread_create in pthreads
-sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
-sage_attention-torch-ext> -- Found Threads: TRUE
+sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
+sage_attention-torch-ext> -- Looking for pthread_create in pthreads
sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found
sage_attention-torch-ext> -- Looking for pthread_create in pthread
-sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
-sage_attention-torch-ext> -- Found Threads: TRUE
sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found
sage_attention-torch-ext> -- Looking for pthread_create in pthread
sage_attention-torch-ext> -- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
@@ -188,6 +182,12 @@ sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
sage_attention-torch-ext> -- Found Threads: TRUE
sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found
sage_attention-torch-ext> -- Looking for pthread_create in pthread
+sage_attention-torch-ext> -- Looking for pthread_create in pthreads - not found
+sage_attention-torch-ext> -- Looking for pthread_create in pthread
+sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
+sage_attention-torch-ext> -- Found Threads: TRUE
+sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
+sage_attention-torch-ext> -- Found Threads: TRUE
sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
sage_attention-torch-ext> -- Found Threads: TRUE
sage_attention-torch-ext> -- Looking for pthread_create in pthread - found
@@ -195,15 +195,15 @@ sage_attention-torch-ext> -- Found Threads: TRUE
sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.6
sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc
sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85
+sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8
+sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc
+sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93
sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.6
sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85/bin/nvcc
sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/7iw4ipbdy17yzvqjhxpw03i17kq7f7rj-cuda_nvcc-12.6.85
sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.9
sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86/bin/nvcc
sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8zrv6h6f2cfz34pwq012n4cx2zrv5m1s-cuda_nvcc-12.9.86
-sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8
-sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc
-sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93
sage_attention-torch-ext> -- PyTorch: Header version is: 12.6
sage_attention-torch-ext> -- Found Python: /nix/store/r3gwdvvsgl1csl12f4pkhz0jhsch7bdy-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter
sage_attention-torch-ext> CMake Warning at /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message):
@@ -219,7 +219,6 @@ sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPA
sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support
sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support
sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90
-sage_attention-torch-ext> -- PyTorch: Header version is: 12.6
sage_attention-torch-ext> CMake Warning at /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found.
sage_attention-torch-ext> Call Stack (most recent call first):
@@ -227,10 +226,29 @@ sage_attention-torch-ext> /nix/store/ld6fk094jhhsnbip1406vrky9lmyxbax-python3.
sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
sage_attention-torch-ext>
sage_attention-torch-ext>
+sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8
+sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc
+sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93
sage_attention-torch-ext> -- Found Torch: /nix/store/pg32mpjmckfs38anjzgyvk2ljfw12pb3-python3.13-torch-2.8.0-lib/lib/libtorch.so
sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0
sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0
+sage_attention-torch-ext> -- PyTorch: Header version is: 12.8
+sage_attention-torch-ext> -- PyTorch: Header version is: 12.6
+sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter
sage_attention-torch-ext> -- Found Python: /nix/store/j6r6hpjs8p5m4s3i8cqqavg62fd5z48g-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter
+sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message):
+sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so
+sage_attention-torch-ext> Call Stack (most recent call first):
+sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include)
+sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package)
+sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
+sage_attention-torch-ext>
+sage_attention-torch-ext>
+sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support
+sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
+sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support
+sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support
+sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120
sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message):
sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so
@@ -276,6 +294,13 @@ sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
sage_attention-torch-ext>
sage_attention-torch-ext>
sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90
+sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
+sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found.
+sage_attention-torch-ext> Call Stack (most recent call first):
+sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found)
+sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
+sage_attention-torch-ext>
+sage_attention-torch-ext>
sage_attention-torch-ext> CMake Warning at /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found.
sage_attention-torch-ext> Call Stack (most recent call first):
@@ -283,14 +308,13 @@ sage_attention-torch-ext> /nix/store/dzz5brlw0xzs9hp3v8fvvwcvkmsr3ls9-python3.
sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
sage_attention-torch-ext>
sage_attention-torch-ext>
+sage_attention-torch-ext> -- Found Torch: /nix/store/mrq1wi2biib2p1mks17g8g5sc4fd492r-python3.13-torch-2.8.0-lib/lib/libtorch.so
+sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
+sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
+sage_attention-torch-ext> -- PyTorch: Header version is: 12.9
sage_attention-torch-ext> -- Found Torch: /nix/store/8sicfhvzq84gnxiwybyjgp80pcynamzn-python3.13-torch-2.7.1-lib/lib/libtorch.so
sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0
sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0
-sage_attention-torch-ext> -- PyTorch: CUDA detected: 12.8
-sage_attention-torch-ext> -- PyTorch: CUDA nvcc is: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93/bin/nvcc
-sage_attention-torch-ext> -- PyTorch: CUDA toolkit directory: /nix/store/8kyv8ffbfvksnqmm1kaz0llysg7dpn9z-cuda_nvcc-12.8.93
-sage_attention-torch-ext> -- PyTorch: Header version is: 12.9
-sage_attention-torch-ext> -- PyTorch: Header version is: 12.8
sage_attention-torch-ext> -- Found Python: /nix/store/aikr517kmcd8r2nrrj70jq71d7352qiq-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter
sage_attention-torch-ext> CMake Warning at /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message):
sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so
@@ -304,20 +328,6 @@ sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN supp
sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support
sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support
-sage_attention-torch-ext> -- Found Python: /nix/store/qal2apcjwlw2p2kk05dwqdgzh8ml687l-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter
-sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120
-sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message):
-sage_attention-torch-ext> Failed to compute shorthash for libnvrtc.so
-sage_attention-torch-ext> Call Stack (most recent call first):
-sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Caffe2/Caffe2Config.cmake:86 (include)
-sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:68 (find_package)
-sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
-sage_attention-torch-ext>
-sage_attention-torch-ext>
-sage_attention-torch-ext> -- USE_CUDNN is set to 0. Compiling without cuDNN support
-sage_attention-torch-ext> -- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
-sage_attention-torch-ext> -- USE_CUDSS is set to 0. Compiling without cuDSS support
-sage_attention-torch-ext> -- USE_CUFILE is set to 0. Compiling without cuFile support
sage_attention-torch-ext> -- Added CUDA NVCC flags for: -gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_100,code=sm_100;-gencode;arch=compute_101,code=sm_101;-gencode;arch=compute_120,code=sm_120
sage_attention-torch-ext> CMake Warning at /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found.
@@ -326,19 +336,9 @@ sage_attention-torch-ext> /nix/store/483ma0klnbln74izv5jiyila52bfwqxh-python3.
sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
sage_attention-torch-ext>
sage_attention-torch-ext>
-sage_attention-torch-ext> CMake Warning at /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
-sage_attention-torch-ext> static library kineto_LIBRARY-NOTFOUND not found.
-sage_attention-torch-ext> Call Stack (most recent call first):
-sage_attention-torch-ext> /nix/store/6drs80sxjhskdki55g5k1dw0jzbd258w-python3.13-torch-2.8.0/lib/python3.13/site-packages/torch/share/cmake/Torch/TorchConfig.cmake:125 (append_torchlib_if_found)
-sage_attention-torch-ext> CMakeLists.txt:30 (find_package)
-sage_attention-torch-ext>
-sage_attention-torch-ext>
sage_attention-torch-ext> -- Found Torch: /nix/store/zccgvlbr93bhyia3sr9f2mddmkp2jyx7-python3.13-torch-2.8.0-lib/lib/libtorch.so
sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
-sage_attention-torch-ext> -- Found Torch: /nix/store/mrq1wi2biib2p1mks17g8g5sc4fd492r-python3.13-torch-2.8.0-lib/lib/libtorch.so
-sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
-sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
sage_attention-torch-ext> -- PyTorch: Header version is: 12.8
sage_attention-torch-ext> -- Found Python: /nix/store/wirj6dihrpcch7idfd7jy4l0hqfsgkk1-python3-3.13.6-env/bin/python (found version "3.13.6") found components: Interpreter
sage_attention-torch-ext> CMake Warning at /nix/store/4ww34a0xcdm3baaz7y2dnrr38r2yjwwx-python3.13-torch-2.7.1/lib/python3.13/site-packages/torch/share/cmake/Caffe2/public/cuda.cmake:146 (message):
@@ -396,12 +396,12 @@ sage_attention-torch-ext>
sage_attention-torch-ext> -- Found Torch: /nix/store/35sj4in2ddx47klyg96qmkpd4vh8py94-python3.13-torch-2.7.1-lib/lib/libtorch.so
sage_attention-torch-ext> -- CUDA target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
sage_attention-torch-ext> -- CUDA supported target architectures: 7.0;7.5;8.0;8.6;8.9;9.0;10.0;10.1;12.0
-sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
+sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
+sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9
-sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
-sage_attention-torch-ext> -- Configuring done (9.3s)
+sage_attention-torch-ext> -- Configuring done (9.4s)
sage_attention-torch-ext> -- Generating done (0.0s)
sage_attention-torch-ext> CMake Warning:
sage_attention-torch-ext> Manually-specified variables were not used by the project:
@@ -433,12 +433,12 @@ sage_attention-torch-ext> cmake: enabled parallel building
sage_attention-torch-ext> cmake: enabled parallel installing
sage_attention-torch-ext> Running phase: buildPhase
sage_attention-torch-ext> build flags: -j21
+sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
+sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a
-sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
-sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
-sage_attention-torch-ext> -- Configuring done (9.4s)
+sage_attention-torch-ext> -- Configuring done (9.5s)
sage_attention-torch-ext> -- Generating done (0.0s)
sage_attention-torch-ext> CMake Warning:
sage_attention-torch-ext> Manually-specified variables were not used by the project:
@@ -470,12 +470,18 @@ sage_attention-torch-ext> cmake: enabled parallel building
sage_attention-torch-ext> cmake: enabled parallel installing
sage_attention-torch-ext> Running phase: buildPhase
sage_attention-torch-ext> build flags: -j21
+sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9
sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
+sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a
+sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
+sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
+sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a
-sage_attention-torch-ext> -- Configuring done (9.4s)
+sage_attention-torch-ext> -- Configuring done (9.5s)
+sage_attention-torch-ext> -- Configuring done (9.6s)
sage_attention-torch-ext> -- Generating done (0.0s)
sage_attention-torch-ext> CMake Warning:
sage_attention-torch-ext> Manually-specified variables were not used by the project:
@@ -503,16 +509,6 @@ sage_attention-torch-ext> Protobuf_PROTOC_EXECUTABLE
sage_attention-torch-ext>
sage_attention-torch-ext>
sage_attention-torch-ext> -- Build files have been written to: /build/source/build
-sage_attention-torch-ext> cmake: enabled parallel building
-sage_attention-torch-ext> cmake: enabled parallel installing
-sage_attention-torch-ext> Running phase: buildPhase
-sage_attention-torch-ext> build flags: -j21
-sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
-sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a
-sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
-sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9
-sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
-sage_attention-torch-ext> -- Configuring done (9.5s)
sage_attention-torch-ext> -- Generating done (0.0s)
sage_attention-torch-ext> CMake Warning:
sage_attention-torch-ext> Manually-specified variables were not used by the project:
@@ -544,12 +540,16 @@ sage_attention-torch-ext> cmake: enabled parallel building
sage_attention-torch-ext> cmake: enabled parallel installing
sage_attention-torch-ext> Running phase: buildPhase
sage_attention-torch-ext> build flags: -j21
+sage_attention-torch-ext> cmake: enabled parallel building
+sage_attention-torch-ext> cmake: enabled parallel installing
+sage_attention-torch-ext> Running phase: buildPhase
+sage_attention-torch-ext> build flags: -j21
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm89: 8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm80: 8.0
sage_attention-torch-ext> -- Capabilities for kernel _fused: 9.0a;8.0;8.9
-sage_attention-torch-ext> -- Capabilities for kernel _qattn: 9.0a;8.0;8.9
sage_attention-torch-ext> -- Capabilities for kernel _qattn_sm90: 9.0a
-sage_attention-torch-ext> -- Configuring done (9.9s)
+sage_attention-torch-ext> -- Configuring done (9.8s)
sage_attention-torch-ext> -- Generating done (0.0s)
sage_attention-torch-ext> CMake Warning:
sage_attention-torch-ext> Manually-specified variables were not used by the project:
@@ -581,3890 +581,5086 @@ sage_attention-torch-ext> cmake: enabled parallel building
sage_attention-torch-ext> cmake: enabled parallel installing
sage_attention-torch-ext> Running phase: buildPhase
sage_attention-torch-ext> build flags: -j21
-sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/torch-ext/torch_binding.cpp.o
-sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/torch-ext/torch_binding.cpp.o
-sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/torch-ext/torch_binding.cpp.o
-sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/torch-ext/torch_binding.cpp.o
-sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/torch-ext/torch_binding.cpp.o
-sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o
+sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_1369690_dirty.dir/torch-ext/torch_binding.cpp.o
+sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_1369690_dirty.dir/torch-ext/torch_binding.cpp.o
+sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_1369690_dirty.dir/torch-ext/torch_binding.cpp.o
+sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_1369690_dirty.dir/torch-ext/torch_binding.cpp.o
+sage_attention-torch-ext> [1/12] Building CXX object CMakeFiles/_sage_attention_1369690_dirty.dir/torch-ext/torch_binding.cpp.o
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
sage_attention-torch-ext> ptxas info : 10 bytes
gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 260 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 260 bytes spill stores, 268 bytes 
spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 232 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 232 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] 
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o
sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 589.615 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 587.369 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 593.451 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 593.487 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 585.325 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 584.058 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 547.358 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 535.674 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 543.831 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 544.308 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 543.627 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 546.697 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 586.599 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 545.861 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 585.711 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 487.314 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 
bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 482.640 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Compile time = 542.011 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.024 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 453.257 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 617.863 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Compile time = 573.264 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 621.067 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 497.092 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 490.878 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 574.183 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 452.252 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 450.873 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 618.598 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 581.817 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 620.754 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 566.486 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 315.800 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 313.400 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.900 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 323.623 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 315.497 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 314.046 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
-sage_attention-torch-ext> ptxas info : Compile time = 321.355 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 323.571 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 338.518 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 335.776 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 342.095 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 344.389 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 336.715 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 338.056 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 343.889 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : 
Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 341.851 ms -sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : 
Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> 
ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 285.493 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 286.733 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.970 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 253.284 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 292.503 ms
[… the remaining unchanged sm_89 qk_int_sv_f8_attn_kernel records (__nv_bfloat16/__half × QuantGranularity 2–3 × MaskMode 0–1): 254–255 registers, 0–32 bytes stack frame, at most 16 bytes of spill stores/loads; the diff adds a "ptxas info : Compile time = 247.094–314.081 ms" line after each record …]
-sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> ptxas info : Compile time = 311.148 ms
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[… the removed sm_89 records for sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu: qk_int_sv_f8_attn_kernel instantiations compiled for 'sm_89' at 228–255 registers, 0–288 bytes stack frame, and up to 300 bytes of spill stores/loads …]
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext>     int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext>                         ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>"
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext>     int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext>                         ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
+sage_attention-torch-ext>     half *sO = (half*)smem_;
+sage_attention-torch-ext>           ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> ptxas info : 10 bytes gmem
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
[… the remaining added sm_90a qk_int8_sv_f8_attn_kernel records (__nv_bfloat16/__half × QuantGranularity 2–3 × MaskMode 0–1, with the last block-size template argument 128 or 64): 128–168 registers, 0 or 16 bytes stack frame, no spills, 128 bytes smem each …]
-sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
-sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[… the removed sm_89 records for the fuse_v_scale variant: qk_int_sv_f8_attn_kernel instantiations at 231–255 registers, 0–288 bytes stack frame, and up to 280 bytes of spill stores/loads …]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 
bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/fused/fused.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 11 bytes gmem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 
registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 33.163 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 29.798 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 25.454 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 24.426 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 4.628 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 4.532 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 4.466 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 4.505 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 4.399 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 4.402 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 3.699 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 3.664 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.805 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.824 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties 
for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.822 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.834 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.798 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.737 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 469.625 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 463.912 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 571.529 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 569.671 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 561.608 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 561.001 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 556.035 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 557.200 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 479.684 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 481.712 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 603.697 ms
[... ptxas statistics for the remaining sm_89 qk_int_sv_f8_attn_kernel instantiations elided: the __half and __nv_bfloat16 variants across QuantGranularity 2/3, both mask modes, and head dims 64/128 use 230-255 registers and 0-80 byte stack frames with up to 104 bytes of spills, and compile in roughly 234-604 ms; the interleaved removed lines are the old sm_90a QuantInt8Kernel entries (28-38 registers, 132 bytes smem, compile times of roughly 8-12 ms) ...]
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext>       int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext>                           ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>"
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext>       int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext>                           ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
+sage_attention-torch-ext>       half *sO = (half*)smem_;
+sage_attention-torch-ext>             ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> ptxas info    : 11 bytes gmem
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 29 registers, used 1 barriers, 132 bytes smem
-sage_attention-torch-ext> ptxas info    : Compile time = 8.754 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info    : Used 168 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info    : Compile time = 170.445 ms
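The useful signal in this wall of ptxas output is register pressure and spilling, which is hard to eyeball across hundreds of entries. Below is a minimal sketch (not part of this PR) that reduces a log like the one above to per-kernel statistics; it relies only on the line shapes visible here ("Compiling entry function ... for ...", "Used N registers", "N bytes spill stores, N bytes spill loads"), and the path `build.log` and the report format are assumptions.

```python
import re
import sys

# Line shapes taken from the ptxas-verbose log above; everything else is a sketch.
ENTRY_RE = re.compile(r"Compiling entry function '(?P<name>[^']+)' for '(?P<arch>\w+)'")
REGS_RE = re.compile(r"Used (?P<regs>\d+) registers")
SPILL_RE = re.compile(r"(?P<stores>\d+) bytes spill stores, (?P<loads>\d+) bytes spill loads")

def parse_ptxas_log(path):
    """Yield one dict per 'Compiling entry function' block in the log."""
    entries, current = [], None
    with open(path, encoding="utf-8", errors="replace") as log:
        for line in log:
            if (m := ENTRY_RE.search(line)):
                current = {"name": m["name"], "arch": m["arch"]}
                entries.append(current)
            elif current is not None:
                if (m := REGS_RE.search(line)):
                    current["registers"] = int(m["regs"])
                elif (m := SPILL_RE.search(line)):
                    current["spill_bytes"] = int(m["stores"]) + int(m["loads"])
    return entries

if __name__ == "__main__":
    for e in parse_ptxas_log(sys.argv[1] if len(sys.argv) > 1 else "build.log"):
        # Flag only kernels that actually spill registers to local memory.
        if e.get("spill_bytes"):
            print(f"{e['arch']:7} {e.get('registers', '?'):>4} regs "
                  f"{e['spill_bytes']:>4} spill bytes  {e['name'][:48]}...")
```

On this excerpt it would flag the sm_89 qk_int_sv_f8_attn_kernel instantiations (up to 255 registers and 104 spill bytes), while the sm_90a qk_int8_sv_f8_attn_kernel entries report no spills.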
'_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 170.445 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 11.051 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 170.922 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 178.615 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.053 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.485 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.675 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.890 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 169.070 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.540 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 172.860 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.861 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 176.685 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 10.674 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 174.977 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 181.300 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 179.394 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.790 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 173.679 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.746 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 174.799 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 179.923 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 180.588 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.777 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.492 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 11.445 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.720 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.093 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.763 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.181 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.682 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.240 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.290 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf 
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.730 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.856 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.114 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 111.653 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 11.251 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.112 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' 
for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 117.481 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 116.677 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.052 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.242 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.064 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 
112.828 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 116.094 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 116.392 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 8.040 ms -sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.453 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 22.917 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.250 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 161.338 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.956 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 21.303 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 16.938 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 155.991 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 17.139 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 158.280 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.362 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.147 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.465 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' 
-sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 163.361 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 3.333 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 166.793 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 169.915 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 168.781 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 3.264 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 164.936 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 3.236 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 163.500 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 169.078 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 170.510 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 3.177 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 104.161 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 3.136 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 104.761 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 109.125 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 108.052 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 2.730 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 104.397 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 2.698 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 104.650 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 109.489 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 108.521 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.144 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 110.758 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.094 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 110.970 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 117.195 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 123.621 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.185 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 112.699 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.197 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 110.730 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 114.895 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 113.441 ms
+sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.164 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.704 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.299 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.152 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 10.838 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 8.200 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
+sage_attention-torch-ext> half *sO = (half*)smem_;
+sage_attention-torch-ext> ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> ptxas info : 11 bytes gmem
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 8.188 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 165.854 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 8.118 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 165.334 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 171.384 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 168.936 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 10.921 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 163.754 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 8.223 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 164.901 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 168.686 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 168.149 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 8.230 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 172.498 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 8.249 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 169.288 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 176.070 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 174.334 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 9.688 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 168.530 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.245 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
+sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 170.021 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 175.010 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
+sage_attention-torch-ext> ptxas info : Compile time = 178.739 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas info
: Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.220 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.287 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.298 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.746 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.637 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 
107.102 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 9.794 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.971 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.254 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.396 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.978 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.550 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.338 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.782 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.208 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.261 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.849 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 114.214 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.283 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.221 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.479 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.147 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.909 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.612 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.457 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.667 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.433 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 153.445 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.690 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.319 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.211 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.849 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.480 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' 
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 152.070 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.359 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.937 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.393 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.221 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 
registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.393 ms -sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 30.117 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 28.478 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.340 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.004 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.129 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 22.777 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 161.468 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 22.734 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.516 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.971 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 165.774 ms +sage_attention-torch-ext> ptxas info : Compiling 
entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.503 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.825 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.364 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.679 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.124 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill 
stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.857 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.330 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.784 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.313 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 101.641 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 105.927 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 104.587 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.250 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.968 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.214 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 108.193 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.581 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.614 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 2.777 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.781 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 2.821 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.417 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 
bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.877 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.774 ms +sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
[... further added ptxas entries for the remaining qk_int_sv_f8_attn_kernel instantiations (__half/__nv_bfloat16, QuantGranularity 2/3, MaskMode 0/1; 255 registers each, 0-96 byte stack frames, sm_89) elided ...]
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0]
-sage_attention-torch-ext> ptxas info : Compile time = 7.237 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj
[... the remaining removed QuantInt8Kernel entries (25-40 registers, 132 bytes smem, 432 bytes cmem[0], compile times ~7-11 ms each), interleaved with further added qk_int_sv_f8_attn_kernel entries, elided ...]
+sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 826.557 ms
[... analogous added entries for the other instantiations in this translation unit (230-255 registers, 0-80 byte stack frames, compile times ~400-1010 ms), interleaved with further removed QuantInt8Kernel entries, elided ...]
+sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 590.184 ms
[... analogous added entries for the other instantiations in this translation unit (251-255 registers, 16-96 byte stack frames, compile times ~400-1040 ms), interleaved with further removed QuantInt8Kernel entries, elided ...]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.338 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 411.247 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 418.848 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 424.521 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.308 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes 
cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 404.732 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.339 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 9.887 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.336 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.359 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.343 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> 
ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.267 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.465 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.501 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.414 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 407.096 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 424.318 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 415.598 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.287 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 451.793 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.495 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 447.662 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 458.394 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 456.104 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.463 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 444.661 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.366 ms -sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/fused/fused.cu.o 
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 444.848 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 459.398 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 464.293 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/fused/fused.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem +sage_attention-torch-ext> ptxas info : 11 bytes gmem sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 54.421 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 47.949 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj 
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 40.944 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 39.316 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.232 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 6.566 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.159 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 7.136 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 7.061 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' 
sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 6.939 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 5.816 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compile time = 5.689 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.363 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.382 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.423 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time 
= 12.503 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.361 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.247 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.299 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.188 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 19.787 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 14.026 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.787 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 14.131 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 19.009 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.657 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.523 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.761 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> 
ptxas info : Compile time = 17.134 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.366 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.538 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.357 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 17.425 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.341 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.212 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.247 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 18.561 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.023 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.779 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.803 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 17.507 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem 
+sage_attention-torch-ext> ptxas info : Compile time = 12.371 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.707 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compile time = 12.546 ms +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 36.806 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 34.855 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 27.632 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 27.810 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 5.357 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 5.366 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 5.246 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 5.272 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 5.183 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 4.962 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 4.507 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 4.381 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.430 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.333 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.380 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.435 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.494 ms sage_attention-torch-ext> ptxas 
info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.143 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.615 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.365 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 17.454 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 13.094 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 13.174 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : 
Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 13.068 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 17.697 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 13.099 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 13.042 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.930 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 14.211 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes 
spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.913 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.271 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.302 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.686 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.103 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.086 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.123 ms 
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.082 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.140 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.702 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.294 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.023 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.350 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' 
sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.314 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compile time = 7.240 ms +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 29.520 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 28.105 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 22.644 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 23.426 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj 
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.446 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.349 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.296 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.317 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.316 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.230 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.816 ms sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.882 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.155 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.172 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.182 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.123 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.209 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.742 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.336 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.164 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.758 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.309 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.158 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.174 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.852 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.224 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.222 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 8.196 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.587 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.197 ms 
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.321 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.244 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 9.772 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.255 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.304 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.249 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas 
info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.109 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.325 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.350 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.384 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.139 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.422 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.451 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o +sage_attention-torch-ext> ptxas info : Compile time = 7.282 ms +sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 575.531 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 573.134 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
-sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 477.685 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 475.948 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 549.043 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 547.290 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 554.362 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 552.915 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 480.589 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 485.373 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 609.560 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 961.641 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 839.593 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 833.012 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1012.245 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1012.331 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 389.256 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 387.388 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 393.773 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 397.407 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 387.125 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 384.227 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 392.006 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 395.280 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 429.918 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 410.430 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 417.292 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 419.824 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 405.015 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 404.581 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 402.388 ms
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info    : Compile time = 387.610 ms
-sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
+sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
-sage_attention-torch-ext>       int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
-sage_attention-torch-ext>                           ^
-sage_attention-torch-ext>
-sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>"
-sage_attention-torch-ext>
-sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
-sage_attention-torch-ext>       int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
-sage_attention-torch-ext>                           ^
-sage_attention-torch-ext>
-sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
-sage_attention-torch-ext>       half *sO = (half*)smem_;
-sage_attention-torch-ext>             ^
-sage_attention-torch-ext>
-sage_attention-torch-ext> ptxas info    : 10 bytes gmem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 168 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 168 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info    : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 999.385 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 1008.310 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 1010.558 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 1022.908 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 993.110 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 986.279 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 894.060 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 598.193 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 539.079 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 525.486 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 636.814 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 643.895 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 515.792 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 516.644 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 636.165 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 633.284 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 243.158 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 246.717 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 256.472 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 251.914 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 246.256 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info    : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info    : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info    : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 247.694 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info    : Compile time = 256.876 ms
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info    : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info    : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 254.324 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 271.323 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling 
entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 268.801 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 279.458 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 276.534 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 269.084 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 270.328 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 279.058 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 275.557 ms +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 
registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : 
Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 224 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 224 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 224 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 224 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 216 bytes spill stores, 232 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 216 bytes spill stores, 232 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative 
stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 216 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 216 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 216 bytes stack frame, 244 bytes spill stores, 236 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 216 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 
bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 264 bytes stack frame, 276 bytes spill stores, 280 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 268 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 272 bytes spill stores, 276 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes 
cmem[0], 8 bytes cmem[2]
[elided: repetitive `ptxas info` resource-usage blocks, one per kernel template instantiation. The added (`+`) lines report `qk_int_sv_f8_attn_kernel` instantiations compiled for 'sm_89' (__half and __nv_bfloat16 variants; 230-255 registers; 0-256-byte stack frames with occasional spill stores/loads; 488 bytes cmem[0], 8 bytes cmem[2]). The removed (`-`) lines report the previous `qk_int8_sv_f8_attn_kernel` instantiations compiled for 'sm_90a' (128-168 registers; 0-16-byte stack frames; no spills; 128 bytes smem).]
-sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
 sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[elided: the remaining `ptxas info` blocks for this object, all `qk_int_sv_f8_attn_kernel` instantiations for 'sm_89'; the `-`/`+` pairs differ only in kernel template arguments, spill statistics, and per-function compile times (roughly 0.3-0.9 s on the `+` side).]
+sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 430.902 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 298.938 ms +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 242.647 ms 
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 236.979 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 235.385 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 243.210 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 247.594 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' 
for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 261.556 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 260.161 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 268.085 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 265.489 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 259.951 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 258.783 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 267.131 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 263.264 ms -sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never 
referenced -sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; -sage_attention-torch-ext> ^ -sage_attention-torch-ext> -sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " -sage_attention-torch-ext> -sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced -sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; -sage_attention-torch-ext> ^ -sage_attention-torch-ext> -sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced -sage_attention-torch-ext> half *sO = (half*)smem_; -sage_attention-torch-ext> ^ -sage_attention-torch-ext> -sage_attention-torch-ext> ptxas info : 11 bytes gmem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 790.771 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 12 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 792.881 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 980.108 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 977.646 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 966.776 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 851.252 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 556.777 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 558.364 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 490.315 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 521.780 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 619.202 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 611.598 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 488.636 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 495.391 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 603.818 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 603.052 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 279.184 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 243.290 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 271.092 ms 
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 280.604 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 283.624 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 240.699 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 246.971 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 246.242 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 270.122 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 244.108 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 269.930 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 281.979 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 278.125 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 287.994 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 285.258 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 295.081 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 291.477 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 281.872 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 239.020 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 246.400 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 244.384 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 284.176 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 295.775 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 291.576 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 271.587 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 175.895 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 262.148 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 271.074 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 264.960 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 179.338 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 181.282 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 186.177 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 262.943 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 176.846 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 268.571 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 300.543 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 458.510 ms
+sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 178.792 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 181.990 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 184.368 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 186.550 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 185.837 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 197.424 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 193.863 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 189.459 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 187.229 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 192.644 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 195.045 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 255.230 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 255.710 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 262.885 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 263.839 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 256.653 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 253.572 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 263.145 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 262.722 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 266.776 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 266.834 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 274.130 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 272.791 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 271.980 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 265.778 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 275.642 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 278.223 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 998.552 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 826.635 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 615.748 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 606.650 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8
bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 603.297 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 593.839 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 603.450 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 602.685 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 501.419 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 496.502 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 632.978 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 631.913 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 499.363 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 497.475 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 624.370 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 628.275 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 172.490 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties 
for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 317.593 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 176.868 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 179.363 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 182.724 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 337.409 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 338.980 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 325.425 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 172.738 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 321.102 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile 
time = 176.483 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 179.536 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 181.599 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 330.768 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 332.775 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 325.485 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 184.437 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 341.080 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 187.298 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 193.122 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 195.503 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 339.245 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 346.842 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 346.461 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 187.034 
ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 344.575 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 184.630 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 191.288 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 189.145 ms -sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 338.607 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function 
properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 346.476 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 345.178 ms +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced -sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; -sage_attention-torch-ext> ^ -sage_attention-torch-ext> -sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " -sage_attention-torch-ext> -sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced -sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; -sage_attention-torch-ext> ^ -sage_attention-torch-ext> -sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced -sage_attention-torch-ext> half *sO = (half*)smem_; -sage_attention-torch-ext> ^ -sage_attention-torch-ext> -sage_attention-torch-ext> ptxas info : 10 bytes gmem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 749.212 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 735.567 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 898.978 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 918.429 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 914.085 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 895.687 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 864.867 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 914.954 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 793.578 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
[Elided: several hundred lines of ptxas -v output, diffed between the old and new sage_attention-torch-ext builds, with the line breaks restored from the flattened log. The removed (-) lines cover the previous qk_int8_sv_f8_attn_kernel<64, 128, 128, {64,128}, ...> instantiations compiled for sm_90a: 128-168 registers, 128 bytes smem, 0-16-byte stack frames, no spills. The added (+) lines cover the qk_int_sv_f8_attn_kernel<128, 64, 32, 64, {64,128}, ...> instantiations compiled for sm_89 from sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu (build step [9/12]) and sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu (build step [6/12]), spanning __half and __nv_bfloat16 outputs, both quantization granularities (QuantGranularity2/QuantGranularity3), and both mask modes: 230-255 registers, stack frames up to 264 bytes with register spills, 488 bytes cmem[0], and per-kernel ptxas compile times of roughly 0.3-1.1 s. Each new translation unit also emits "nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used", which nvcc prints when --threads is passed more than once with different values.]
488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 260 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 288 bytes stack frame, 260 bytes spill stores, 268 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 232 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 
bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 232 bytes spill stores, 248 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 
bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 581.614 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 
barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 579.645 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 566.773 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 824.023 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 943.343 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 951.713 ms +sage_attention-torch-ext> 
ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Compile time = 948.674 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
-sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compile time = 943.709 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compile time = 839.100 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compile time = 832.107 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 1057.496 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 1076.174 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 870.170 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 858.787 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 1044.382 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 1058.303 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 399.733 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 391.304 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 406.681 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 402.870 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 396.301 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 387.645 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 401.033 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 401.514 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 436.194 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 426.740 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 437.151 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 437.547 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 428.190 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 432.674 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 441.354 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 441.692 ms
+sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o
sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 866.249 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 862.299 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 873.782 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 875.831 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 852.270 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 851.476 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 880.964 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 863.392 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 772.722 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 777.240 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 926.046 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 931.904 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 749.924 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 80 bytes spill stores, 68 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 758.203 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 946.039 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 1016.690 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 405.534 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 251 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 412.191 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 403.205 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
sage_attention-torch-ext> 32 bytes
stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 387.725 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 387.513 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 435.032 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function 
properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 397.812 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 438.839 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 443.567 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 445.955 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 253 
registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 461.657 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 467.032 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 483.735 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 488.336 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 247 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 497.941 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 474.535 ms -sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o +sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 68 bytes spill stores, 52 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads 
sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, 
used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 
488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 264 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 264 bytes stack frame, 268 bytes spill stores, 280 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 264 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 288 bytes stack frame, 260 bytes spill stores, 268 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 288 bytes stack frame, 260 bytes spill stores, 268 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 288 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 256 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 256 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 256 bytes spill stores, 260 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 
bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 232 bytes spill stores, 248 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 232 bytes spill stores, 248 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 272 bytes stack frame, 292 bytes spill stores, 292 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 272 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 256 bytes stack frame, 212 bytes spill stores, 224 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 256 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 228 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 235 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 242 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
-sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o +sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 733.104 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 731.085 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 918.992 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function 
properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 918.653 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 915.256 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 912.595 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 909.402 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 909.714 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 772.525 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 775.748 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 967.721 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 961.232 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 775.724 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 780.124 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 950.656 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 954.345 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 375.330 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 381.202 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 390.741 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 
bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 388.969 ms
qk_int_sv_f8_attn_kernel<128u,64u,32u,64u,HD, DataType(1), QG,QG, float, B0, dtype, ComputeUnit(1), MM, B1,B2,B3,B4> instantiations for 'sm_89', decoded from the mangled names (B0 is the bool before the output dtype, B1-B4 the trailing bools); all lines removed in this hunk; each variant uses 1 barrier, 488 bytes cmem[0], 8 bytes cmem[2]:
     HD   QG   dtype  MM   B0;B1-B4   stack  spills st/ld  regs  compile time
     64   QG2  bf16   MM0  0;0,0,0,0   0 B    0/0 B        235    375.466 ms
     64   QG2  half   MM0  0;0,0,0,0   0 B    0/0 B        235    384.207 ms
     64   QG2  bf16   MM0  0;1,0,0,0  32 B    0/0 B        230    383.898 ms
     64   QG2  half   MM0  0;1,0,0,0  32 B    0/0 B        230    388.396 ms
     64   QG3  bf16   MM1  0;0,0,0,0   0 B    0/0 B        241    422.907 ms
     64   QG3  half   MM1  0;0,0,0,0   0 B    0/0 B        241    419.306 ms
     64   QG3  bf16   MM1  0;1,0,0,0  32 B    0/0 B        232    424.313 ms
     64   QG3  half   MM1  0;1,0,0,0  32 B    0/0 B        232    429.593 ms
     64   QG2  bf16   MM1  0;0,0,0,0   0 B    0/0 B        247    419.164 ms
     64   QG2  half   MM1  0;0,0,0,0   0 B    0/0 B        247    420.692 ms
     64   QG2  bf16   MM1  0;1,0,0,0  32 B    0/0 B        240    425.393 ms
     64   QG2  half   MM1  0;1,0,0,0  32 B    0/0 B        240    423.774 ms
-sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
-sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
qk_int_sv_f8_attn_kernel instantiations for 'sm_89' (each: 255 registers, 1 barrier, 488 bytes cmem[0], 8 bytes cmem[2]); stack/spill values shown old -> new where this hunk changed them; the 'Compile time' lines (values shown) are additions in this hunk:
     HD   QG   dtype  MM   B0;B1-B4   stack        spills st/ld      compile time
     128  QG3  bf16   MM0  0;0,1,1,0  40 B         36/40 -> 36/44 B   876.597 ms
     128  QG3  half   MM0  0;0,1,1,0  40 B         36/40 -> 36/44 B   972.582 ms
     128  QG3  bf16   MM0  0;1,1,1,0  48 B         12/24 B            987.989 ms
     128  QG3  half   MM0  0;1,1,1,0  48 B         12/24 B            995.288 ms
     128  QG2  bf16   MM0  0;0,1,1,0  40 -> 32 B   36/36 -> 32/32 B   994.117 ms
     128  QG2  half   MM0  0;0,1,1,0  40 -> 32 B   36/36 -> 32/32 B   982.604 ms
     128  QG2  bf16   MM0  0;1,1,1,0  64 -> 48 B   20/20 -> 8/8 B     981.111 ms
     128  QG2  half   MM0  0;1,1,1,0  64 -> 48 B   20/20 -> 8/8 B     970.839 ms
     128  QG3  bf16   MM1  0;0,1,1,0  40 B         36/36 -> 40/40 B   846.454 ms
     128  QG3  half   MM1  0;0,1,1,0  40 B         36/36 -> 40/40 B   849.497 ms
     128  QG3  bf16   MM1  0;1,1,1,0  64 -> 80 B   24/24 -> 48/52 B  1031.414 ms
     128  QG3  half   MM1  0;1,1,1,0  64 -> 80 B   24/24 -> 48/52 B  1048.652 ms
     128  QG2  bf16   MM1  0;0,1,1,0  40 B         40/40 -> 36/36 B   835.362 ms
     128  QG2  half   MM1  0;0,1,1,0  40 B         40/40 -> 36/36 B   829.827 ms
     128  QG2  bf16   MM1  0;1,1,1,0  80 -> 64 B   60/48 -> 32/32 B  1017.734 ms
     128  QG2  half   MM1  0;1,1,1,0  80 -> 64 B   60/48 -> 32/32 B  1023.163 ms
HD=64 instantiations added in this hunk (each: 1 barrier, 488 bytes cmem[0], 8 bytes cmem[2]):
     HD   QG   dtype  MM   B0;B1-B4   stack  spills st/ld  regs  compile time
     64   QG3  bf16   MM0  0;0,1,1,0   0 B    0/0 B        245    414.881 ms
     64   QG3  half   MM0  0;0,1,1,0   0 B    0/0 B        245    246.112 ms
     64   QG3  bf16   MM0  0;1,1,1,0  32 B    0/0 B        230    250.384 ms
     64   QG3  half   MM0  0;1,1,1,0  32 B    0/0 B        230    252.290 ms
     64   QG2  bf16   MM0  0;0,1,1,0   0 B    0/0 B        233    249.246 ms
     64   QG2  half   MM0  0;0,1,1,0   0 B    0/0 B        233    242.309 ms
     64   QG2  bf16   MM0  0;1,1,1,0  32 B    0/0 B        230    252.676 ms
     64   QG2  half   MM0  0;1,1,1,0  32 B    0/0 B        230    249.916 ms
     64   QG3  bf16   MM1  0;0,1,1,0   0 B    0/0 B        234    270.735 ms
     64   QG3  half   MM1  0;0,1,1,0   0 B    0/0 B        234    266.398 ms
     64   QG3  bf16   MM1  0;1,1,1,0  32 B    0/0 B        244    269.132 ms
     64   QG3  half   MM1  0;1,1,1,0  32 B    0/0 B        244    389.197 ms
     64   QG2  bf16   MM1  0;0,1,1,0   0 B    0/0 B        237    434.975 ms
     64   QG2  half   MM1  0;0,1,1,0   0 B    0/0 B        237    446.924 ms
     64   QG2  bf16   MM1  0;1,1,1,0  32 B    0/0 B        232    455.440 ms
     64   QG2  half   MM1  0;1,1,1,0  32 B    0/0 B        232    458.558 ms
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
HD=128 instantiations added in this hunk (each: 255 registers, 1 barrier, 488 bytes cmem[0], 8 bytes cmem[2]; no 'Compile time' lines):
     HD   QG   dtype  MM   B0;B1-B4   stack  spills st/ld
     128  QG3  bf16   MM0  0;0,1,1,0  40 B   36/40 B
     128  QG3  half   MM0  0;0,1,1,0  40 B   36/40 B
     128  QG3  bf16   MM0  0;1,1,1,0  48 B   12/24 B
     128  QG3  half   MM0  0;1,1,1,0  48 B   12/24 B
     128  QG2  bf16   MM0  0;0,1,1,0  40 B   36/36 B
     128  QG2  half   MM0  0;0,1,1,0  40 B   36/36 B
     128  QG2  bf16   MM0  0;1,1,1,0  64 B   20/20 B
     128  QG2  half   MM0  0;1,1,1,0  64 B   20/20 B
     128  QG3  bf16   MM1  0;0,1,1,0  40 B   36/36 B
     128  QG3  half   MM1  0;0,1,1,0  40 B   36/36 B
     128  QG3  bf16   MM1  0;1,1,1,0  64 B   24/24 B
     128  QG3  half   MM1  0;1,1,1,0  64 B   24/24 B
     128  QG2  bf16   MM1  0;0,1,1,0  40 B   40/40 B
     128  QG2  half   MM1  0;0,1,1,0  40 B   40/40 B
     128  QG2  bf16   MM1  0;1,1,1,0  80 B   60/48 B
     128  QG2  half   MM1  0;1,1,1,0  80 B   60/48 B
 sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
 sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
 sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
@@ -4530,333 +5726,301 @@ sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_s
 sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o
+sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
qk_int_sv_f8_attn_kernel instantiations for 'sm_89' (each: 255 registers, 1 barrier, 488 bytes cmem[0], 8 bytes cmem[2]); stack/spill values shown old -> new where this hunk changed them; the 'Compile time' lines (old values shown) are removals in this hunk:
     HD   QG   dtype  MM   B0;B1-B4   stack       spills st/ld       compile time
     128  QG3  bf16   MM0  1;0,1,0,0  16 B        28/12 -> 24/16 B    993.379 ms
     128  QG3  half   MM0  1;0,1,0,0  16 B        28/12 -> 24/16 B    983.666 ms
     128  QG3  bf16   MM0  1;1,1,0,0  64 B        76/44 -> 88/48 B    991.494 ms
     128  QG3  half   MM0  1;1,1,0,0  64 B        76/44 -> 88/48 B    995.815 ms
     128  QG2  bf16   MM0  1;0,1,0,0  16 -> 24 B  28/12 -> 36/20 B    984.680 ms
     128  QG2  half   MM0  1;0,1,0,0  16 -> 24 B  28/12 -> 36/20 B    977.430 ms
     128  QG2  bf16   MM0  1;1,1,0,0  48 B        16/12 B             996.712 ms
     128  QG2  half   MM0  1;1,1,0,0  48 B        16/12 B             997.032 ms
     128  QG3  bf16   MM1  1;0,1,0,0  40 B        52/40 B             817.076 ms
     128  QG3  half   MM1  1;0,1,0,0  40 B        52/40 B             810.474 ms
     128  QG3  bf16   MM1  1;1,1,0,0  80 B        44/36 B            1036.490 ms
     128  QG3  half   MM1  1;1,1,0,0  80 B        44/36 B             737.972 ms
     128  QG2  bf16   MM1  1;0,1,0,0  40 B        52/40 B             496.712 ms
     128  QG2  half   MM1  1;0,1,0,0  40 B        52/40 B             487.620 ms
     128  QG2  bf16   MM1  1;1,1,0,0  80 B        44/40 B             619.107 ms
     128  QG2  half   MM1  1;1,1,0,0  80 B        44/40 B             621.681 ms
      64  QG3  bf16   MM0  1;0,1,0,0   0 B         0/0 B              315.178 ms
      64  QG3  half   MM0  1;0,1,0,0   0 B         0/0 B              314.475 ms
      64  QG3  bf16   MM0  1;1,1,0,0  32 B         0/0 B              321.573 ms
      64  QG3  half   MM0  1;1,1,0,0  32 B         0/0 B              322.633 ms
      64  QG2  bf16   MM0  1;0,1,0,0   0 B         0/0 B              353.122 ms
      64  QG2  half   MM0  1;0,1,0,0   0 B         0/0 B              310.499 ms
sage_attention-torch-ext>
ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.386 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.051 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 337.406 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 337.050 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 342.868 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 344.463 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 336.617 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 334.307 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf 
sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 346.369 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 345.404 ms -sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o +sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 970.903 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 969.569 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 992.699 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 993.083 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 958.046 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 972.919 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 951.359 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 948.223 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 820.379 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 829.366 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 598.749 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 596.165 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1012.789 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads +sage_attention-torch-ext> ptxas info : Compile time = 501.088 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1003.097 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 813.168 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 817.103 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads 
+sage_attention-torch-ext> ptxas info : Compile time = 497.756 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 573.002 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 581.095 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 588.939 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 584.204 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 998.870 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads +sage_attention-torch-ext> ptxas info : Compile time = 513.593 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1007.322 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 390.714 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 394.228 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 413.533 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 510.562 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 656.214 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
+sage_attention-torch-ext> ptxas info : Compile time = 643.416 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 527.443 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 525.280 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 662.868 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 655.125 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 243.729 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.722 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 408.229 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 247.248 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 248.027 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 387.351 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 242.174 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 379.782 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 239.441 ms 
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 380.296 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 257.898 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 381.719 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.742 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
[ptxas build-log diff condensed; the underlying hunks are hundreds of near-identical "Compiling entry function … / Function properties … / Used N registers …" reports.

Unchanged context: build step [8/12] still compiles sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o, and every .cu step emits "nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used". The object directory hash changes from _sage_attention_af2d0c0_dirty to _sage_attention_1369690_dirty, and step [9/12] switches from sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o to sage_attention/fused/fused.cu.o.

Removed reports: the old sm_89 qk_int_sv_f8_attn_kernel instantiations (230-255 registers, 0-80 bytes stack frame, up to 60 bytes spill stores, 488 bytes cmem[0], compile times ~270-460 ms); build step [6/12] qattn/qk_int_sv_f8_cuda_sm90.cu.o together with its #177-D warnings for the never-referenced variables padded_kv_len (qk_int_sv_f8_cuda_sm90.cu:627 and :803) and sO (qk_int_sv_f8_cuda_sm90.cu:170); and the sm_90a qk_int8_sv_f8_attn_kernel instantiations (128-168 registers, 128 bytes smem, compile times ~110-305 ms).

Added reports: the new sm_89 qk_int_sv_f8_attn_kernel instantiations (253-255 registers, compile times ~261-275 ms) and the sm_90a kernels from fused/fused.cu: MeanScaleKernel (32 registers, 260-392 bytes smem, ~40-53 ms), TransposePadPermuteKernel (19-21 registers, 16-32 KiB smem, ~7 ms), SubMeanKernel (16-20 registers, no barriers, ~6-7 ms), and QuantInt8Kernel (28-38 registers, 132 bytes smem, ~12-20 ms). None of the added kernels spill registers.]
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 199.900 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 14.067 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 197.293 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 205.149 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 211.099 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.535 ms +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 271.682 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 19.165 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 269.586 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 281.733 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 277.306 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.529 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 269.288 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.434 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 274.055 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 276.263 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 276.553 ms -sage_attention-torch-ext> ptxas info : Compiling entry 
function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 13.280 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 286.596 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 16.255 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 280.377 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 291.432 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 294.755 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.843 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 281.571 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.201 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 280.015 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 294.676 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 289.791 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.213 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 183.860 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 15.513 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 182.850 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 
194.201 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 188.360 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.053 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 183.835 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.232 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 182.459 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 194.673 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 188.311 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 11.991 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 190.778 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 18.070 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 192.092 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 206.914 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 205.892 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.546 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 192.388 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.369 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack 
frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 192.225 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 202.459 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 200.399 ms -sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.080 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 16.883 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.496 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj 
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.565 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 12.891 ms sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 989.903 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 975.144 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 994.561 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1001.132 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 961.806 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 977.769 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 973.138 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 976.834 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 836.467 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 863.853 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 1048.856 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
[ptxas log diff condensed. Removed: the remaining sm_89 reports for this object's qk_int_sv_f8_attn_kernel instantiations (__half and __nv_bfloat16, QuantGranularity 2/3, MaskMode 0/1): 230-255 registers, 1 barrier, 0-80 byte stack frames with up to 52 bytes of spills, 488 bytes cmem[0], 8 bytes cmem[2], compile times ~240-1045 ms per instantiation. Added: sm_80 reports for MeanScaleKernel<64, …> (32 registers, 260-392 bytes smem, 432 bytes cmem[0], ~27-36 ms) and TransposePadPermuteKernel<128|64, 64, 1, …> (20-25 registers, 16384-32768 bytes smem, 396 bytes cmem[0], ~5 ms), all with zero stack usage and no spills.]
-sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
-sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas log diff condensed. Removed: the sm_89 reports for this object's qk_int_sv_f8_attn_kernel instantiations: 249-255 registers, 1 barrier, 16-112 byte stack frames with up to 176 bytes of spills, 488 bytes cmem[0], 8 bytes cmem[2], ~380-1075 ms per instantiation. Added: sm_80 reports for SubMeanKernel<128|64, 64|128, 1, …> (16-20 registers, no barriers, 412 bytes cmem[0], ~4-5 ms) and the __nv_bfloat16 QuantInt8Kernel<128|64, 32|16, 1, 0, 0, …> instantiations (26-27 registers, 1 barrier, 132 bytes smem, 432 bytes cmem[0], ~10-11 ms), all with zero stack usage and no spills.]
[ptxas log diff condensed: the same pattern continues through this object's last instantiations, then:]
-sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
-sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas log diff condensed. Removed: the sm_89 reports for this object's qk_int_sv_f8_attn_kernel instantiations: 251-255 registers, 1 barrier, 16-96 byte stack frames with up to 176 bytes of spills, 488 bytes cmem[0], 8 bytes cmem[2], ~390-1080 ms per instantiation. Added: sm_80 reports for the __half QuantInt8Kernel<128|64, 32|16, 1, 0, 0, …> instantiations (26-27 registers, 1 barrier, 132 bytes smem, 432 bytes cmem[0], ~10-12 ms), all with zero stack usage and no spills.]
-sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 428.684 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 423.411 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.261 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 425.723 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 
bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 17.417 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 418.213 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 465.548 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 468.465 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.709 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes 
cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 457.939 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.789 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 461.659 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 475.348 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 455.041 ms -sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes 
cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.587 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 17.321 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.960 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 12.834 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 
13.048 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 15.219 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.313 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 11.251 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 740.709 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 732.739 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 939.937 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 928.398 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 924.225 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 924.967 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 932.908 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
[ptxas -v log diff elided: the removed lines report per-instantiation statistics for the sm_89 'qk_int_sv_f8_attn_kernel' template (230-255 registers, 32-80 byte stack frames, compile times of roughly 370-955 ms); the added lines report statistics for the sm_80 'QuantInt8Kernel' template (25-32 registers, 132 bytes smem, no stack frame or spills, compile times of roughly 11-16 ms)]
-sage_attention-torch-ext> [8/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o
-sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
[ptxas -v log diff elided: the removed lines report per-instantiation statistics for the sm_89 'qk_int_sv_f8_attn_kernel' template (231-255 registers, 32-288 byte stack frames with up to 280 bytes of spills); the added lines report statistics for the sm_80 'QuantInt8Kernel' and sm_89 'MeanScaleKernel' templates (25-40 registers, 132-392 bytes smem, no stack frame or spills, compile times of roughly 11-30 ms)]
-sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o
-sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
[ptxas -v log diff elided: the removed lines report per-instantiation statistics for the sm_89 'qk_int_sv_f8_attn_kernel' template (230-255 registers, 8-80 byte stack frames, compile times of roughly 400-1026 ms); the added lines report statistics for the sm_89 'MeanScaleKernel' and 'TransposePadPermuteKernel' templates (19-40 registers, up to 32768 bytes smem, no stack frame or spills, compile times of roughly 3-23 ms)]
-sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.243 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 399.658 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 410.496 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 410.007 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] 
+sage_attention-torch-ext> ptxas info : Compile time = 3.209 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 436.062 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.185 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 433.903 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 442.161 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 
441.720 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 3.101 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 429.612 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.695 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 430.207 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 437.492 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 437.610 ms -sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] 
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 60 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 2.691 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.151 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 
registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.375 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.427 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 230 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.392 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.428 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.915 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.532 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : 
Used 237 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 232 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 24 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 
bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 88 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 24 bytes stack frame, 36 bytes spill stores, 20 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 24 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
[… several hundred near-identical ptxas lines elided. The removed ('-') log lines report, for each remaining qk_int_sv_f8_attn_kernel<…> instantiation (__half and __nv_bfloat16 variants across QuantGranularity 2/3 and MaskMode 0/1) compiled for 'sm_89': 254–255 registers, a 0–80-byte stack frame with matching spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], and compile times of roughly 0.4–1.0 s per instantiation. They also cover build steps [9/12] (sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o) and [10/12] (sage_attention/fused/fused.cu.o), each preceded by "nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used", plus the 'sm_90a' reports from fused.cu for MeanScaleKernel (32 registers, 260–392 bytes smem), TransposePadPermuteKernel (19–21 registers, 16384–32768 bytes smem), SubMeanKernel (16–20 registers, no barriers), and QuantInt8Kernel (28–38 registers, 132 bytes smem). The added ('+') log lines replace them with far lighter QuantInt8Kernel<…> reports for 'sm_89': 25–40 registers, 132 bytes smem, 432 bytes cmem[0], zero-byte stack frames with no spills, and compile times of ~7–11 ms per kernel. …]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89'
sage_attention-torch-ext> ptxas info : Function properties for
_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.334 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.348 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 10.073 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.380 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compiling 
entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.375 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] +sage_attention-torch-ext> ptxas info : Compile time = 7.222 ms +sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress " +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced +sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced +sage_attention-torch-ext> half *sO = (half*)smem_; +sage_attention-torch-ext> ^ +sage_attention-torch-ext> +sage_attention-torch-ext> ptxas info : 10 bytes gmem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 
registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 20 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling 
entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' 
for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> 
ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' 
-sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 39 registers, used 1 barriers, 392 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 260 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj -sage_attention-torch-ext> 0 bytes stack 
frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] 
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/fused/fused.cu.o +sage_attention-torch-ext> 
ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/fused/fused.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 11 bytes gmem +sage_attention-torch-ext> ptxas info : 10 bytes gmem sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 49.500 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb1E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 50.131 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 42.722 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for 
_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 40.751 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.398 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.277 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.262 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 7.268 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 7.002 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 6.967 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 5.811 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers -sage_attention-torch-ext> ptxas info : Compile time = 5.598 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.524 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.467 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.386 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.471 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas 
info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.348 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 11.941 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.091 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 12.620 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 20.600 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 14.674 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 14.472 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 
'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 14.295 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 19.425 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 14.192 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 13.751 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 14.187 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 17.681 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 28 
registers, used 1 barriers, 132 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 12.777 ms
[ptxas resource reports condensed: one near-identical report per template instantiation; the recoverable information is summarized below]
* QuantInt8Kernel instantiations for 'sm_90a' (__half and __nv_bfloat16, 64/128-wide tiles, all flag combinations): 28-32 registers, 1 barrier, 132 bytes smem, 0-byte stack frames, no spills; the per-kernel "Compile time" lines (~12-19 ms) are removed by this diff.
* Changed, once before the sm_80 reports and once before the sm_89 reports: "-ptxas info : 11 bytes gmem, 88 bytes cmem[4]" -> "+ptxas info : 10 bytes gmem, 80 bytes cmem[4]".
* MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel and QuantInt8Kernel instantiations for 'sm_80' and 'sm_89' (__half and __nv_bfloat16, tile sizes 16-128): 16-40 registers, 0-1 barriers, up to 32768 bytes smem (TransposePadPermuteKernel), 396-432 bytes cmem[0], 0-byte stack frames, no spills; the removed "Compile time" lines range from ~4 ms (SubMeanKernel) to ~50 ms (MeanScaleKernel).
* Changed build step: "-[10/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o" -> "+[10/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/fused/fused.cu.o", followed by the context line "nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used".
* Removed reports (with "-ptxas info : 11 bytes gmem, 88 bytes cmem[4]"): qk_int_sv_f8_attn_kernel instantiations for 'sm_89' (QuantGranularity 2/3, MaskMode 0/1, __half and __nv_bfloat16): 254-255 registers, 1 barrier, 32-80 byte stack frames, most with register spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], compile times ~0.5-1.0 s each.
* Added reports ("+ptxas info : 10 bytes gmem"): MeanScaleKernel instantiations compiled for 'sm_90a': 32 registers, 1 barrier, 392 bytes smem, 0-byte stack frames, no spills.
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack
size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 433.531 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 434.755 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 507.129 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 506.114 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 438.820 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 432.929 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 392 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 526.397 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15MeanScaleKernelILj64ELb0E6__halfEvPT1_PaPfS4_fjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 525.905 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 550.307 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 523.220 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 260 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 529.581 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E13__nv_bfloat16EvPT2_S2_jjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 533.405 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 536.842 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 533.635 ms -sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 534.947 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 523.143 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 529.204 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 534.048 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 530.867 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 530.137 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 535.064 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 533.832 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 456.808 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 450.443 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 564.432 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 566.371 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 452.925 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 453.794 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 562.106 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 563.019 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj128ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 21 registers, used 1 barriers, 32768 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 20 registers, used 0 barriers +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info 
: Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : 
Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 29 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes 
spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 32 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 28 registers, used 1 barriers, 132 bytes smem +sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15MeanScaleKernelILj64ELb1E13__nv_bfloat16EvPT1_PaPfS4_fjjjjjjjjjjj' 
for 'sm_80'
[ptxas resource-usage reports for the sm_80 instantiations of MeanScaleKernel (32 registers, 260-392 bytes smem), TransposePadPermuteKernel (20-25 registers, 16384-32768 bytes smem), SubMeanKernel (16-20 registers, no smem), and QuantInt8Kernel (25-38 registers, 132 bytes smem); every instantiation reports 0 bytes stack frame and no spill stores or loads]
+sage_attention-torch-ext> ptxas info    : 10 bytes gmem, 80 bytes cmem[4]
[ptxas resource-usage reports for the same kernel set compiled for sm_89: 16-40 registers, identical smem footprints, again with no stack frames or spills]
+sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info    : 11 bytes gmem, 88 bytes cmem[4]
[ptxas reports for the sm_89 instantiations of qk_int_sv_f8_attn_kernel: 254-255 registers each, 0-80 byte stack frames, up to 84 bytes of spill stores and 56 bytes of spill loads, 488 bytes cmem[0] and 8 bytes cmem[2]; per-instantiation compile time 431-1026 ms]
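The 255-register figures above sit at the per-thread register cap for these architectures, which is why those instantiations are the ones that also report spill stores and loads: spilled values live in per-thread local memory. As a hedged aside (not part of this build), the same numbers that the ptxas verbose report prints can be read back at runtime through the CUDA runtime API; the sketch below uses a hypothetical dummy_kernel as a stand-in for any of the instantiations listed here.

// Minimal sketch: cross-checking a ptxas resource report at runtime.
// `dummy_kernel` is a hypothetical stand-in, not part of sage_attention.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel(float* out) {
    out[threadIdx.x] = static_cast<float>(threadIdx.x);
}

int main() {
    cudaFuncAttributes attr{};
    cudaFuncGetAttributes(&attr, (const void*)dummy_kernel);
    // numRegs         -> "Used N registers"
    // sharedSizeBytes -> static "smem" per block
    // localSizeBytes  -> per-thread local memory, where register spills land
    std::printf("regs=%d smem=%zu local=%zu const=%zu\n",
                attr.numRegs, attr.sharedSizeBytes,
                attr.localSizeBytes, attr.constSizeBytes);
    return 0;
}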
+sage_attention-torch-ext> [2/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext>       int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext>                           ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress <error-number>"
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
+sage_attention-torch-ext>       int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
+sage_attention-torch-ext>                           ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
+sage_attention-torch-ext>       half *sO = (half*)smem_;
+sage_attention-torch-ext>             ^
+sage_attention-torch-ext>
+sage_attention-torch-ext> ptxas info    : 28 bytes gmem
[ptxas reports for the first sm_90a instantiations of qk_int8_sv_f8_attn_kernel: 168 registers, 1 barrier, 128 bytes smem, no stack frame or spills; compile time roughly 163-165 ms each]
+sage_attention-torch-ext> ptxas info    : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
+sage_attention-torch-ext> ptxas 
info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 168.109 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 168.494 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 163.334 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.936 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.919 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 167.494 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.728 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 172.263 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 171.204 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 
170.337 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.521 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.146 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 169.215 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 169.211 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes 
smem +sage_attention-torch-ext> ptxas info : Compile time = 103.110 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.017 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.913 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.820 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.073 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 
registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 102.760 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.418 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.151 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 107.918 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.250 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill 
loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.272 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.196 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.622 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.072 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 113.129 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf 
+sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 112.985 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.866 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 152.247 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 157.337 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.475 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 152.326 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 151.880 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 156.948 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 157.435 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.109 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 159.365 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 164.546 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 165.551 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 160.853 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.161 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 162.325 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 161.610 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 281.333 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.696 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 276.018 
ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 248.588 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 245.696 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 281.997 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 282.024 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 246.247 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 246.254 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.100 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.793 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 
bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.643 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 304.576 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 99.585 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 302.526 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 311.316 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : 
Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 312.464 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 100.444 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 104.224 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 103.844 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 303.259 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.050 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 304.059 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 305.252 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 308.140 ms -sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.060 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 110.421 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.914 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.169 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 106.104 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile 
time = 109.643 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem +sage_attention-torch-ext> ptxas info : Compile time = 109.592 ms +sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' @@ -8949,1062 +9123,888 @@ sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_ sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu.o -sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used -sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 452.454 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 8 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 443.502 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 532.956 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 537.629 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 534.217 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 531.217 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 530.849 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 48 bytes stack frame, 
4 bytes spill stores, 4 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 533.586 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 460.908 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 465.682 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 569.037 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 
8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 573.063 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 465.808 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 458.740 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 561.391 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 563.473 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 234.014 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 232.164 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 237.732 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 239.603 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas 
info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 235.259 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 230.945 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 
bytes spill loads -sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 237.116 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 234.590 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 243 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 248.623 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties 
for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 243 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 249.389 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 253.105 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 255.004 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 247.180 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 240 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 248.331 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 253.542 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' +sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 252.644 ms -sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu.o +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_1369690_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 3 minutes 28 seconds +sage_attention-torch-ext> Running phase: installPhase +sage_attention-torch-ext> install flags: -j21 install +sage_attention-torch-ext> [0/1] Install the 
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/_sage_attention_1369690_dirty/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext/sage_attention/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/bgkz7fkw3xnhpn3s9xl0mh6hzw50igaz-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
+sage_attention-torch-ext> [3/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
-sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(627): warning #177-D: variable "padded_kv_len" was declared but never referenced
-sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
-sage_attention-torch-ext> ^
-sage_attention-torch-ext>
-sage_attention-torch-ext> Remark: The warnings can be suppressed with "-diag-suppress "
-sage_attention-torch-ext>
-sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(803): warning #177-D: variable "padded_kv_len" was declared but never referenced
-sage_attention-torch-ext> int qo_len, kv_len, padded_kv_len, num_qo_heads, num_kv_heads;
-sage_attention-torch-ext> ^
-sage_attention-torch-ext>
-sage_attention-torch-ext> /build/source/sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu(170): warning #177-D: variable "sO" was declared but never referenced
-sage_attention-torch-ext> half *sO = (half*)smem_;
-sage_attention-torch-ext> ^
-sage_attention-torch-ext>
-sage_attention-torch-ext> ptxas info : 28 bytes gmem
 [elided: removed ptxas statistics for the old log's sm_90a qk_int8_sv_f8_attn_kernel instantiations (128-168 registers, 128 bytes smem, ~105-180 ms compile times)]
-sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 106.161 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 105.674 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 109.413 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 109.181 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack 
frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 111.543 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 436.735 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 112.125 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 116.869 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 117.121 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 431.396 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 533.219 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 530.670 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 112.579 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 521.712 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 112.528 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 116.400 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb1EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 116.183 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 526.479 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 
barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 523.878 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 518.861 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.431 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 460.510 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 554.292 ms +sage_attention-torch-ext> 
ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 554.013 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 456.477 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 457.482 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 554.059 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 554.122 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 156.785 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 226.781 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 156.825 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
-sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 161.773 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 161.445 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 239 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 229.174 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.113 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 234.133 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' 
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 156.797 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 226.809 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 156.440 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 161.382 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 161.569 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 227.034 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 229 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 229.475 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 229 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 229.017 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 164.343 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : 
Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.178 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 163.968 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 169.839 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 170.616 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 241 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.310 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.848 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.673 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 165.412 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 243 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 245.746 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 167 registers, used 1 barriers, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 166.605 ms 
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 166.927 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj128EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf -sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 168 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem -sage_attention-torch-ext> ptxas info : Compile time = 166.854 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a' -sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf +sage_attention-torch-ext> ptxas info : Used 243 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 246.534 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.643 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack 
frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 250.444 ms
+sage_attention-torch-ext> [4/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 559.443 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 554.409 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 568.894 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 24 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 567.309 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 555.844 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 554.456 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 550.486 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 549.836 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 486.794 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 487.416 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 592.195 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 591.013 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 484.986 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 489.331 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 590.586 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 585.963 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 102.785 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 239.921 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 103.132 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 106.373 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 106.519 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 245 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 241.653 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 246.697 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 249.332 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 103.314 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 241.879 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 103.217 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 106.862 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode0ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 106.648 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 236 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 241.201 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 247.318 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 231 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 249.491 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 108.480 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 267.306 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 108.160 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 113.455 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity3ELS0_3E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 113.071 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 234 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 263.643 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 270.400 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 244 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 268.505 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 109.215 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb0ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
+sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 263.997 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 109.279 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E13__nv_bfloat16L8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 113.087 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf' for 'sm_90a'
-sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int8_sv_f8_attn_kernelILj64ELj128ELj128ELj64EL16QuantGranularity2ELS0_2E6__halfL8MaskMode1ELb1ELb0EEv14CUtensorMap_stS3_S3_PfS4_S4_PT5_S4_jjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 128 registers, used 1 barriers, 16 bytes cumulative stack size, 128 bytes smem
-sage_attention-torch-ext> ptxas info : Compile time = 113.033 ms
-sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> buildPhase completed in 3 minutes 30 seconds
-sage_attention-torch-ext> Running phase: installPhase
-sage_attention-torch-ext> install flags: -j21 install
-sage_attention-torch-ext> [0/1] Install the project...
-sage_attention-torch-ext> -- Install configuration: "Release"
-sage_attention-torch-ext> -- Installing: /nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/_sage_attention_af2d0c0_dirty/_sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> Running phase: fixupPhase
-sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext
-sage_attention-torch-ext> shrinking /nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> checking for references to /build/ in /nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext...
-sage_attention-torch-ext> patching script interpreter paths in /nix/store/2bvjs99wvlawr8lk16ihaa9vsjigppcw-sage_attention-torch-ext
-sage_attention-torch-ext> Running phase: installCheckPhase
-sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
-sage_attention-torch-ext> Checking of ABI compatibility
-sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
-sage_attention-torch-ext> ✅ No compatibility issues found
-sage_attention-torch-ext> Checking loading kernel with get_kernel
-sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
-sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> ptxas info : Used 238 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 262.643 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 269.183 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 233 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 267.816 ms
+sage_attention-torch-ext> [5/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_attn_inst_buf.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
 sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 558.324 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 88 bytes stack frame, 156 bytes spill stores, 148 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 88 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 556.812 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 469.632 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 80 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 471.587 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 536.044 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 16 bytes stack frame, 32 bytes spill stores, 24 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 535.069 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 536.844 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 48 bytes stack frame, 32 bytes spill stores, 20 bytes spill loads
+sage_attention-torch-ext> ptxas info : Compile time = 532.802 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 540.558 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 475.283 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 479.128 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 593.466 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 92 bytes spill stores, 88 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 595.802 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 495.385 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 96 bytes stack frame, 176 bytes spill stores, 164 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 495.046 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 594.391 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
-sage_attention-torch-ext> 112 bytes stack frame, 136 bytes spill stores, 128 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 592.701 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Compile time = 524.131 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 532.873 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 12 bytes spill stores, 8 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 531.895 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 529.212 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 84 bytes spill stores, 56 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 530.045 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 530.147 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 529.409 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 453.739 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 48 bytes stack frame, 64 bytes spill stores, 48 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 451.804 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 565.533 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 566.111 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 451.967 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 32 bytes stack frame, 36 bytes spill stores, 32 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 450.567 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 559.371 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 557.747 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 230.920 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 279.939 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
-sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 228.210 ms
-sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
-sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2]
+sage_attention-torch-ext> ptxas info : Compile time = 277.584 ms
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 233.322
ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 246.335 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 235.323 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 227.194 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 227.237 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 246.460 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 281.817 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 281.227 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 231.856 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 246.478 ms +sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 235.053 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 245.385 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 254.284 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 303.254 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] 
-sage_attention-torch-ext> ptxas info : Compile time = 250.838 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 302.164 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 256.750 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 308.583 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 256.867 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : 
Compile time = 309.322 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 248.230 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 299.838 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 250.416 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 299.171 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for 
_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 255.372 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> ptxas info : Compile time = 304.632 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 254.559 ms -sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> ptxas info : Compile time = 303.234 ms +sage_attention-torch-ext> [6/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 554.827 ms +sage_attention-torch-ext> ptxas info : Compile time = 573.985 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 555.616 ms +sage_attention-torch-ext> ptxas info : Compile time = 566.523 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 558.617 ms +sage_attention-torch-ext> ptxas info : Compile time = 569.328 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 76 bytes spill stores, 44 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 557.441 ms +sage_attention-torch-ext> ptxas info : Compile time = 570.085 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 550.462 ms +sage_attention-torch-ext> ptxas info : Compile time 
= 561.603 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 28 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 16 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 550.848 ms +sage_attention-torch-ext> ptxas info : Compile time = 560.633 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 555.254 ms +sage_attention-torch-ext> ptxas info : Compile time = 566.286 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 48 bytes stack frame, 16 bytes spill stores, 12 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 48 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 552.498 ms +sage_attention-torch-ext> ptxas info : Compile time = 564.440 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : 
Compile time = 469.257 ms +sage_attention-torch-ext> ptxas info : Compile time = 476.978 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 470.105 ms +sage_attention-torch-ext> ptxas info : Compile time = 478.590 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 580.429 ms +sage_attention-torch-ext> ptxas info : Compile time = 591.063 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 36 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 584.866 ms +sage_attention-torch-ext> ptxas info : Compile time = 594.559 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack 
size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 473.537 ms +sage_attention-torch-ext> ptxas info : Compile time = 481.319 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 40 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 471.687 ms +sage_attention-torch-ext> ptxas info : Compile time = 477.812 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 579.194 ms +sage_attention-torch-ext> ptxas info : Compile time = 590.875 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 80 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 80 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 583.672 ms +sage_attention-torch-ext> ptxas info : Compile time = 594.283 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 296.443 ms +sage_attention-torch-ext> ptxas info : Compile time = 300.753 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 295.757 ms +sage_attention-torch-ext> ptxas info : Compile time = 301.565 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 300.175 ms +sage_attention-torch-ext> ptxas info : Compile time = 308.950 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 304.243 ms +sage_attention-torch-ext> ptxas info : Compile time = 308.883 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 292.201 ms +sage_attention-torch-ext> ptxas info : Compile time = 300.010 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 289.473 ms +sage_attention-torch-ext> ptxas info : Compile time = 296.211 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 300.834 ms +sage_attention-torch-ext> ptxas info : Compile time = 305.394 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 300.674 ms +sage_attention-torch-ext> ptxas info : Compile time = 307.656 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 319.536 ms +sage_attention-torch-ext> ptxas info : Compile time = 325.958 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 316.038 ms +sage_attention-torch-ext> ptxas info : Compile time = 321.870 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.601 ms +sage_attention-torch-ext> ptxas info : Compile time = 328.642 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.080 ms +sage_attention-torch-ext> ptxas info : Compile time = 329.535 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> 
ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 314.583 ms +sage_attention-torch-ext> ptxas info : Compile time = 321.379 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 315.980 ms +sage_attention-torch-ext> ptxas info : Compile time = 322.064 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.983 ms +sage_attention-torch-ext> ptxas info : Compile time = 329.599 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 321.031 ms -sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o +sage_attention-torch-ext> ptxas info : Compile time = 328.798 ms +sage_attention-torch-ext> [7/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
 sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode0ELb0ELb1ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext>     56 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 571.810 ms
+sage_attention-torch-ext> ptxas info : Compile time = 581.629 ms
[... diff trimmed: the remaining 'sm_89' instantiations of qk_int_sv_f8_attn_kernel (__half and __nv_bfloat16, QuantGranularity 2/3, MaskMode 0/1) report identical register, stack-frame, and spill statistics in both builds; only the reported ptxas compile times rise by roughly 4-13 ms per kernel ...]
 sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 265.664 ms
-sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> ptxas info : Compile time = 270.725 ms
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
 sage_attention-torch-ext> ptxas info : 10 bytes gmem, 80 bytes cmem[4]
 sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
@@ -10519,1006 +10519,843 @@ sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_s
 sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext>     32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> buildPhase completed in 3 minutes 45 seconds
-sage_attention-torch-ext> Running phase: installPhase
-sage_attention-torch-ext> install flags: -j21 install
-sage_attention-torch-ext> [0/1] Install the project...
-sage_attention-torch-ext> -- Install configuration: "Release"
-sage_attention-torch-ext> -- Installing: /nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/_sage_attention_af2d0c0_dirty/_sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> Running phase: fixupPhase
-sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext
-sage_attention-torch-ext> shrinking /nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> checking for references to /build/ in /nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext...
-sage_attention-torch-ext> patching script interpreter paths in /nix/store/0pmiqd0nyndanj9rwlfnry7dzb3ad6cs-sage_attention-torch-ext
-sage_attention-torch-ext> Running phase: installCheckPhase
-sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
-sage_attention-torch-ext> Checking of ABI compatibility
-sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
-sage_attention-torch-ext> ✅ No compatibility issues found
-sage_attention-torch-ext> Checking loading kernel with get_kernel
-sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
-sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
 sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
 sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4]
 sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
 sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
 sage_attention-torch-ext>     0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
 sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 577.793 ms
+sage_attention-torch-ext> ptxas info : Compile time = 562.853 ms
[... diff trimmed: the remaining 'sm_80' instantiations of qk_int_sv_f16_attn_kernel likewise show identical register, stack-frame, and spill statistics in both builds, with ptxas compile times dropping by roughly 4-16 ms per kernel ...]
 sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 225.805 ms +sage_attention-torch-ext> ptxas info : Compile time = 219.993 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 480 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 219.814 ms +sage_attention-torch-ext> ptxas info : Compile time = 215.176 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 177 registers, used 1 barriers, 480 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 220.003 ms +sage_attention-torch-ext> ptxas info : Compile time = 214.525 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 178 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 224.826 ms +sage_attention-torch-ext> ptxas info : Compile time = 219.478 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj16ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 178 registers, used 1 barriers, 16 bytes cumulative stack size, 480 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 224.445 ms +sage_attention-torch-ext> ptxas info : Compile time = 218.865 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 266.497 ms +sage_attention-torch-ext> ptxas info : Compile time = 259.508 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 264.516 ms +sage_attention-torch-ext> ptxas info : Compile time = 258.299 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 273.296 ms +sage_attention-torch-ext> ptxas info : Compile time = 265.840 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 271.292 ms +sage_attention-torch-ext> ptxas info : Compile time = 264.617 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 245.256 ms +sage_attention-torch-ext> ptxas info : Compile time = 238.380 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 244.541 ms +sage_attention-torch-ext> ptxas info : Compile time = 238.551 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 273.650 ms +sage_attention-torch-ext> ptxas info : Compile time = 264.369 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 270.650 ms +sage_attention-torch-ext> ptxas info : Compile time = 264.371 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 285.458 ms +sage_attention-torch-ext> ptxas info : Compile time = 276.182 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 285.065 ms +sage_attention-torch-ext> ptxas info : Compile time = 277.184 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 292.409 ms +sage_attention-torch-ext> ptxas info : Compile time = 284.611 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 292.646 ms +sage_attention-torch-ext> ptxas info : Compile time = 283.427 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 284.793 ms +sage_attention-torch-ext> ptxas info : Compile time = 274.751 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 292.497 ms +sage_attention-torch-ext> ptxas info : Compile time = 280.771 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 301.876 ms +sage_attention-torch-ext> ptxas info : Compile time = 290.338 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 300.200 ms +sage_attention-torch-ext> ptxas info : Compile time = 290.616 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 552.634 ms +sage_attention-torch-ext> ptxas info : Compile time = 534.780 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 518.553 ms +sage_attention-torch-ext> ptxas info : Compile time = 502.975 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 554.599 ms +sage_attention-torch-ext> ptxas info : Compile time = 535.653 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 526.619 ms +sage_attention-torch-ext> ptxas info : Compile time = 509.089 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 433.431 ms +sage_attention-torch-ext> ptxas info : Compile time = 416.771 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 410.252 ms +sage_attention-torch-ext> ptxas info : Compile time = 394.961 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 545.211 ms +sage_attention-torch-ext> ptxas info : Compile time = 525.418 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 517.720 ms +sage_attention-torch-ext> ptxas info : Compile time = 500.357 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 561.874 ms +sage_attention-torch-ext> ptxas info : Compile time = 541.948 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 533.685 ms +sage_attention-torch-ext> ptxas info : Compile time = 517.746 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 571.367 ms +sage_attention-torch-ext> ptxas info : Compile time = 556.254 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 544.202 ms +sage_attention-torch-ext> ptxas info : Compile time = 525.692 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 559.312 ms +sage_attention-torch-ext> ptxas info : Compile time = 543.210 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 532.679 ms +sage_attention-torch-ext> ptxas info : Compile time = 517.263 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 570.656 ms +sage_attention-torch-ext> ptxas info : Compile time = 553.876 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 542.606 ms +sage_attention-torch-ext> ptxas info : Compile time = 529.800 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 246.739 ms +sage_attention-torch-ext> ptxas info : Compile time = 239.195 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 237.643 ms +sage_attention-torch-ext> ptxas info : Compile time = 231.484 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 252.617 ms +sage_attention-torch-ext> ptxas info : Compile time = 245.930 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 243.258 ms +sage_attention-torch-ext> ptxas info : Compile time = 237.639 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 244.601 ms +sage_attention-torch-ext> ptxas info : Compile time = 237.991 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 236.755 ms +sage_attention-torch-ext> ptxas info : Compile time = 231.052 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 252.411 ms +sage_attention-torch-ext> ptxas info : Compile time = 244.852 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 242.202 ms +sage_attention-torch-ext> ptxas info : Compile time = 236.286 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 262.965 ms +sage_attention-torch-ext> ptxas info : Compile time = 258.189 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 253.665 ms +sage_attention-torch-ext> ptxas info : Compile time = 246.841 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 269.733 ms +sage_attention-torch-ext> ptxas info : Compile time = 264.735 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 261.897 ms +sage_attention-torch-ext> ptxas info : Compile time = 254.389 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 263.350 ms +sage_attention-torch-ext> ptxas info : Compile time = 257.184 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 254.564 ms +sage_attention-torch-ext> ptxas info : Compile time = 248.351 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 268.621 ms +sage_attention-torch-ext> ptxas info : Compile time = 264.825 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80'

[ptxas verbose output condensed. The remainder of this stretch repeats the same report shape — "Compiling entry function '<mangled name>' for 'sm_80'", "Function properties for <mangled name>", "<N> bytes stack frame, <N> bytes spill stores, <N> bytes spill loads", "Used <N> registers, used 1 barriers, ... cmem[0], ... cmem[2]", "Compile time = <N> ms" — for every qk_int_sv_f16_attn_kernel instantiation built for sm_80: head dim 64 and 128, QuantGranularity 2 and 3, __half and __nv_bfloat16 outputs, MaskMode 0 and 1. Each instantiation uses 200-255 registers, 1 barrier, a 0-112 byte stack frame, at most 68 bytes of spill stores and 76 bytes of spill loads, 480 bytes cmem[0], and 8 or 16 bytes cmem[2]. None of these statistics differ between the two builds; every -/+ pair in this stretch only updates the nondeterministic "Compile time" line (old run ≈246-500 ms, new run ≈240-488 ms).]
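Since the only stable signal in reports like these is the register/stack/spill statistics rather than the timing churn, a log like this is easiest to audit mechanically. Below is a minimal sketch (not part of this build) that tabulates the stable per-kernel numbers from a ptxas verbose log; the regexes assume exactly the line shapes quoted above, one report line per log line, and all helper names are mine.

import re
import sys
from typing import Iterable, Iterator, NamedTuple

class Entry(NamedTuple):
    name: str          # mangled entry-function name
    arch: str          # e.g. "sm_80"
    registers: int
    stack_frame: int   # bytes
    spill_stores: int  # bytes
    spill_loads: int   # bytes

# Patterns for the three ptxas line shapes that carry the stable statistics.
ENTRY_RE = re.compile(r"Compiling entry function '([^']+)' for '(\w+)'")
FRAME_RE = re.compile(
    r"(\d+) bytes stack frame, (\d+) bytes spill stores, (\d+) bytes spill loads")
REGS_RE = re.compile(r"Used (\d+) registers")

def parse_ptxas_log(lines: Iterable[str]) -> Iterator[Entry]:
    """Yield one Entry per kernel report in a ptxas verbose log."""
    name = arch = None
    frame = None
    for line in lines:
        if m := ENTRY_RE.search(line):
            name, arch = m.group(1), m.group(2)
        elif m := FRAME_RE.search(line):
            frame = tuple(int(g) for g in m.groups())
        elif (m := REGS_RE.search(line)) and name and frame is not None:
            yield Entry(name, arch, int(m.group(1)), *frame)
            name, arch, frame = None, None, None

if __name__ == "__main__":
    # Usage: python ptxas_stats.py < build.log
    for e in parse_ptxas_log(sys.stdin):
        print(f"{e.arch}  {e.registers:3d} regs  {e.stack_frame:3d}B stack  "
              f"{e.spill_stores:3d}B spill-st  {e.name[:48]}...")

Running two builds' logs through a parser like this and diffing the tables makes it obvious that the hunk above is pure compile-time noise: every tabulated row is identical across the two runs.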
[The f16 hunk closes with one more head-dim-64 timing update (272.217 ms removed, 264.555 ms added). The removed lines then show the old build finishing: "[12/12] Linking CXX shared module _sage_attention_af2d0c0_dirty.abi3.so", "buildPhase completed in 3 minutes 45 seconds", "Running phase: installPhase" with "install flags: -j21 install", and installation into /nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext/. In their place, the new build's step [8/12] compiles sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_attn.cu for target _sage_attention_1369690_dirty, where the old build's [8/12] compiled sm89_qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn.cu for _sage_attention_af2d0c0_dirty. Both logs emit "nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used" and "ptxas info : 28 bytes gmem, 224 bytes cmem[4]" for this object. In the per-kernel reports that follow, every removed qk_int_sv_f8_attn_kernel instantiation carries the boolean template arguments ...ELb1ELb1ELb0E while every added one carries ...ELb1ELb0ELb0E: the second-to-last flag flips from true to false, consistent with the fuse_v_mean variant being dropped. The head-dim-128 sm_89 reports are otherwise similar on both sides — 255 registers, 1 barrier, 0-80 byte stack frames, at most ~48/40 bytes of spill stores/loads, 488 bytes cmem[0], 8 bytes cmem[2], compile times ≈450-595 ms per entry.]
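To see exactly which template parameter distinguishes the removed fuse_v_mean instantiations from their replacements, the mangled names can be demangled and compared. A minimal sketch, assuming binutils' c++filt is on PATH; the two symbols are copied verbatim from the hunk above, and the comparison logic is mine.

import subprocess

# One removed (fuse_v_scale_fuse_v_mean) and one added (fuse_v_scale)
# sm_89 entry point from the hunk above.
OLD = ("_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1E"
       "L16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1E"
       "L8MaskMode0ELb0ELb1ELb1ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf")
NEW = ("_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1E"
       "L16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1E"
       "L8MaskMode0ELb0ELb1ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf")

def demangle(symbol: str) -> str:
    # c++filt ships with binutils and demangles Itanium-ABI names.
    return subprocess.run(["c++filt", symbol], capture_output=True,
                          text=True, check=True).stdout.strip()

old, new = demangle(OLD), demangle(NEW)
print(old)
print(new)
# The two names have the same comma structure, so a pairwise comparison
# isolates the single template argument that changed.
for i, (a, b) in enumerate(zip(old.split(","), new.split(","))):
    if a != b:
        print(f"argument {i}: {a.strip()} -> {b.strip()}")

On these two symbols the loop reports a single boolean template argument flipping from true to false, matching the source-file rename from the fuse_v_scale_fuse_v_mean variant to the fuse_v_scale variant.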
[The remaining lines in this hunk are removals of old-build output only: the head-dim-64 qk_int_sv_f8_attn_kernel reports (231-245 registers, 0-32 byte stack frames, no spills, ≈240-270 ms each), the old fixupPhase ("shrinking RPATHs of ELF executables and libraries in /nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext", including _sage_attention_af2d0c0_dirty.abi3.so), and the start of the old step [9/12], sm89_qk_int8_sv_f8_accum_f32_attn.cu (255 registers and ≈430-535 ms per entry), cut off mid-report:]
-sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode0ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 518.957 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 456.437 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 459.820 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 556.469 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 554.382 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 456.700 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 40 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 456.712 ms -sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' -sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb0EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf -sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads -sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 552.185 ms -sage_attention-torch-ext> ptxas info : Compiling entry function 
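The entry-function names in these ptxas lines are Itanium-mangled C++ template instantiations: the Lj values are the tile/head-dimension constants, and the L8DataType/L16QuantGranularity/L8MaskMode/Lb arguments encode the quantization and masking variants being compared across the two builds. For readers who want the readable form, a minimal host-side sketch (a hypothetical helper, not part of this repository; assumes a GCC/Clang toolchain that ships <cxxabi.h>):

    // demangle.cu -- print the demangled form of the kernel symbols above
    // Build: nvcc demangle.cu -o demangle   (any host C++ compiler works too)
    #include <cxxabi.h>
    #include <cstdio>
    #include <cstdlib>

    int main(int argc, char** argv) {
        for (int i = 1; i < argc; ++i) {
            int status = 0;
            // __cxa_demangle returns a malloc'd readable name when status == 0
            char* readable = abi::__cxa_demangle(argv[i], nullptr, nullptr, &status);
            std::printf("%s\n", (status == 0 && readable) ? readable : argv[i]);
            std::free(readable);
        }
        return 0;
    }

Passing one of the '_Z24qk_int_sv_f8_attn_kernelI...' symbols should expand it into the full qk_int_sv_f8_attn_kernel<...> parameter list, which makes the old/new instantiation diff below much easier to scan.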
-sage_attention-torch-ext> [… ptxas statistics for the old _sage_attention_af2d0c0_dirty object elided: qk_int_sv_f8_attn_kernel instantiations for 'sm_89', 229-248 registers, compile times ~226-252 ms …]
+sage_attention-torch-ext> [… ptxas statistics for the new _sage_attention_1369690_dirty object elided: the same instantiations with one additional boolean template flag enabled, 231-244 registers, compile times ~232-257 ms …]
-sage_attention-torch-ext> checking for references to /build/ in /nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext...
-sage_attention-torch-ext> patching script interpreter paths in /nix/store/mkd1kn188s2i4xnh80z6397w35dcn0b9-sage_attention-torch-ext
+sage_attention-torch-ext> [… remaining new-build ptxas statistics elided: 233 registers per instantiation, 32 bytes stack frame, compile times ~251-256 ms …]
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 42 seconds
+sage_attention-torch-ext> Running phase: installPhase
+sage_attention-torch-ext> install flags: -j21 install
+sage_attention-torch-ext> [0/1] Install the project...
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/_sage_attention_1369690_dirty/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext/sage_attention/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/fh2zcbpbd5xsajp0dqgwmpw3hk9kfcdv-sage_attention-torch-ext
+sage_attention-torch-ext> Running phase: installCheckPhase
+sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
+sage_attention-torch-ext> Checking of ABI compatibility
+sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 and Python ABI version 3.9
+sage_attention-torch-ext> ✅ No compatibility issues found
+sage_attention-torch-ext> Checking loading kernel with get_kernel
+sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 43 seconds
+sage_attention-torch-ext> Running phase: installPhase
+sage_attention-torch-ext> install flags: -j21 install
+sage_attention-torch-ext> [0/1] Install the project...
+sage_attention-torch-ext> -- Install configuration: "Release"
+sage_attention-torch-ext> -- Installing: /nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/_sage_attention_1369690_dirty/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> Running phase: fixupPhase
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext/sage_attention/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/8c8qa08gh6b21c551d4z6z6z1zp15jqr-sage_attention-torch-ext
 sage_attention-torch-ext> Running phase: installCheckPhase
 sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
 sage_attention-torch-ext> Checking of ABI compatibility
@@ -11526,661 +11363,824 @@ sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 an
 sage_attention-torch-ext> ✅ No compatibility issues found
 sage_attention-torch-ext> Checking loading kernel with get_kernel
 sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
-sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o
+sage_attention-torch-ext> [9/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o
+sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
+sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4]
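Two recurring lines in this log are worth decoding. The nvcc warning means --threads was passed more than once on the same compile line and nvcc kept the last value; the per-kernel "ptxas info" blocks (registers, stack frame, spill stores/loads) come from passing -Xptxas=-v. A toy reproduction, assuming a CUDA 12.x toolchain with an sm_89 target (file and kernel names are illustrative):

    // toy.cu -- reproduces the 'threads' warning and the ptxas resource report
    // Compile: nvcc -arch=sm_89 --threads=4 --threads=1 -Xptxas=-v -c toy.cu
    __global__ void toy_kernel(const float* in, float* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = in[i] * 2.0f;  // ptxas reports registers/stack for this entry
    }

The repeated "Used 255 registers" readings below are also meaningful: 255 is the per-thread register ceiling on sm_89, so instantiations pinned at it tend to show the nonzero stack frames and spill stores/loads seen in this object file.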
+sage_attention-torch-ext> [… per-kernel ptxas statistics for sm89_qk_int8_sv_f8_accum_f16_attn_inst_buf.cu.o elided: 249-255 registers, 0-112 bytes stack frame, up to 176 bytes spill stores / 164 bytes spill loads, 488 bytes cmem[0]; compile times ~228-596 ms …]
+sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89'
+sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf
+sage_attention-torch-ext> 0 bytes stack frame, 0 bytes
spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 251.641 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.345 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.210 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 250.628 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb0ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 252.510 ms +sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 258.441 ms +sage_attention-torch-ext> ptxas info : Compiling entry function '_Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf' for 'sm_89' +sage_attention-torch-ext> ptxas info : Function properties for _Z24qk_int_sv_f8_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit1EL8MaskMode1ELb1ELb0ELb0ELb1EEvPaS5_S5_PT9_PfS8_S8_S8_S8_jjjjjjjjjjjjjjjf +sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 488 bytes cmem[0], 8 bytes cmem[2] +sage_attention-torch-ext> ptxas info : Compile time = 255.946 ms +sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 11 bytes gmem, 88 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 576.356 ms +sage_attention-torch-ext> ptxas info : Compile time = 564.271 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 
480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 540.449 ms +sage_attention-torch-ext> ptxas info : Compile time = 528.211 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 579.980 ms +sage_attention-torch-ext> ptxas info : Compile time = 567.301 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 550.877 ms +sage_attention-torch-ext> ptxas info : Compile time = 537.719 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 450.748 ms +sage_attention-torch-ext> ptxas info : Compile time = 440.514 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes 
cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 436.310 ms +sage_attention-torch-ext> ptxas info : Compile time = 426.754 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 569.668 ms +sage_attention-torch-ext> ptxas info : Compile time = 553.227 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 539.170 ms +sage_attention-torch-ext> ptxas info : Compile time = 526.362 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 584.251 ms +sage_attention-torch-ext> ptxas info : Compile time = 569.300 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] 
-sage_attention-torch-ext> ptxas info : Compile time = 556.690 ms +sage_attention-torch-ext> ptxas info : Compile time = 544.159 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 596.932 ms +sage_attention-torch-ext> ptxas info : Compile time = 581.426 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 567.192 ms +sage_attention-torch-ext> ptxas info : Compile time = 551.856 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 585.485 ms +sage_attention-torch-ext> ptxas info : Compile time = 570.959 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas 
info : Compile time = 555.673 ms +sage_attention-torch-ext> ptxas info : Compile time = 542.792 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 595.002 ms +sage_attention-torch-ext> ptxas info : Compile time = 580.170 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 568.517 ms +sage_attention-torch-ext> ptxas info : Compile time = 552.088 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 227 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 245.328 ms +sage_attention-torch-ext> ptxas info : Compile time = 237.978 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 217 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 237.060 ms 
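Because hunks like this one change nothing but per-kernel "Compile time" values, they are easier to review after folding the log into one line per kernel. The following is an illustrative sketch only (a hypothetical helper, not a script from this repository; every name in it is made up), written for ptxas -v output of the shape shown above:

#!/usr/bin/env python3
# Illustrative sketch only -- not part of this repository. Folds `ptxas -v`
# output (as in the log above) into one summary line per compiled kernel so
# two build logs can be diffed at a glance.
import re
import sys

ENTRY_RE = re.compile(r"Compiling entry function '([^']+)' for '(sm_\d+)'")
REGS_RE  = re.compile(r"Used (\d+) registers")
SPILL_RE = re.compile(r"(\d+) bytes spill stores, (\d+) bytes spill loads")
TIME_RE  = re.compile(r"Compile time = ([\d.]+) ms")

def parse(path):
    """Yield one dict per kernel: mangled name, arch, registers, spills, time."""
    cur = None
    with open(path) as fh:
        for line in fh:
            if m := ENTRY_RE.search(line):
                cur = {"kernel": m[1], "arch": m[2], "spill": (0, 0)}
            elif cur and (m := REGS_RE.search(line)):
                cur["regs"] = int(m[1])
            elif cur and (m := SPILL_RE.search(line)):
                cur["spill"] = (int(m[1]), int(m[2]))
            elif cur and (m := TIME_RE.search(line)):
                cur["ms"] = float(m[1])
                yield cur
                cur = None

if __name__ == "__main__":
    for k in parse(sys.argv[1]):
        flag = " SPILLS" if any(k["spill"]) else ""
        print(f"{k['arch']}  {k.get('regs', 0):3d} regs  "
              f"{k['ms']:8.3f} ms{flag}  {k['kernel'][:64]}")

Run over the pre- and post-change logs, diffing the two summaries surfaces register or spill regressions that the mangled-name noise otherwise hides.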
[ptxas log continues: compile-time pairs for the remaining qk_int_sv_f16_attn_kernel instantiations, same pattern as above.]

sage_attention-torch-ext> ptxas info : Compiling entry function
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 302.119 ms +sage_attention-torch-ext> ptxas info : Compile time = 294.132 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb1E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 300.418 ms +sage_attention-torch-ext> ptxas info : Compile time = 294.033 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 555.557 ms +sage_attention-torch-ext> ptxas info : Compile time = 540.389 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 519.550 ms +sage_attention-torch-ext> ptxas info : Compile time = 509.537 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 551.355 ms +sage_attention-torch-ext> ptxas info : Compile time = 540.716 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 524.203 ms +sage_attention-torch-ext> ptxas info : Compile time = 514.407 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 429.088 ms +sage_attention-torch-ext> ptxas info : Compile time = 422.445 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 406.496 ms +sage_attention-torch-ext> ptxas info : Compile time = 400.817 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 541.189 ms +sage_attention-torch-ext> ptxas info : Compile time = 531.424 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 515.693 ms +sage_attention-torch-ext> ptxas info : Compile time = 505.847 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 561.090 ms +sage_attention-torch-ext> ptxas info : Compile time = 549.355 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 535.625 ms +sage_attention-torch-ext> ptxas info : Compile time = 521.168 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 571.384 ms +sage_attention-torch-ext> ptxas info : Compile time = 557.945 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 541.083 ms +sage_attention-torch-ext> ptxas info : Compile time = 532.220 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 557.912 ms +sage_attention-torch-ext> ptxas info : Compile time = 546.641 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 531.773 ms +sage_attention-torch-ext> ptxas info : Compile time = 519.650 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 569.410 ms +sage_attention-torch-ext> ptxas info : Compile time = 558.367 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 541.200 ms +sage_attention-torch-ext> ptxas info : Compile time = 530.083 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 201 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 244.572 ms +sage_attention-torch-ext> ptxas info : Compile time = 240.074 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 237.413 ms +sage_attention-torch-ext> ptxas info : Compile time = 232.210 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 250.707 ms +sage_attention-torch-ext> ptxas info : Compile time = 245.509 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 207 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 243.286 ms +sage_attention-torch-ext> ptxas info : Compile time = 238.357 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 243.345 ms +sage_attention-torch-ext> ptxas info : Compile time = 237.668 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 204 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 234.974 ms +sage_attention-torch-ext> ptxas info : Compile time = 230.888 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 250.862 ms +sage_attention-torch-ext> ptxas info : Compile time = 243.525 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 241.827 ms +sage_attention-torch-ext> ptxas info : Compile time = 237.689 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 263.864 ms +sage_attention-torch-ext> ptxas info : Compile time = 255.042 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 253.610 ms +sage_attention-torch-ext> ptxas info : Compile time = 246.985 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 270.959 ms +sage_attention-torch-ext> ptxas info : Compile time = 262.963 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 199 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 260.788 ms +sage_attention-torch-ext> ptxas info : Compile time = 254.173 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 262.670 ms +sage_attention-torch-ext> ptxas info : Compile time = 254.833 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 198 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 254.072 ms +sage_attention-torch-ext> ptxas info : Compile time = 247.697 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 269.637 ms +sage_attention-torch-ext> ptxas info : Compile time = 262.415 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 260.484 ms +sage_attention-torch-ext> ptxas info : Compile time = 254.606 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 479.896 ms +sage_attention-torch-ext> ptxas info : Compile time = 470.452 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 476.006 ms +sage_attention-torch-ext> ptxas info : Compile time = 467.379 ms sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 489.100 ms +sage_attention-torch-ext> ptxas info : Compile time = 478.690 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 32 bytes spill stores, 40 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 495.375 ms +sage_attention-torch-ext> ptxas info : Compile time = 481.126 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 472.317 ms +sage_attention-torch-ext> ptxas info : Compile time = 461.883 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 474.508 ms +sage_attention-torch-ext> ptxas info : Compile time = 463.891 ms 
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 475.359 ms +sage_attention-torch-ext> ptxas info : Compile time = 462.930 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 473.822 ms +sage_attention-torch-ext> ptxas info : Compile time = 462.210 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 488.158 ms +sage_attention-torch-ext> ptxas info : Compile time = 477.580 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 486.883 ms +sage_attention-torch-ext> ptxas info : 
Compile time = 479.471 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 500.040 ms +sage_attention-torch-ext> ptxas info : Compile time = 488.864 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 501.215 ms +sage_attention-torch-ext> ptxas info : Compile time = 489.588 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 487.436 ms +sage_attention-torch-ext> ptxas info : Compile time = 476.875 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 484.705 ms 
+sage_attention-torch-ext> ptxas info : Compile time = 478.730 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 493.841 ms +sage_attention-torch-ext> ptxas info : Compile time = 488.466 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 498.617 ms +sage_attention-torch-ext> ptxas info : Compile time = 486.969 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 246.558 ms +sage_attention-torch-ext> ptxas info : Compile time = 241.338 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 246.318 ms +sage_attention-torch-ext> ptxas info : Compile 
time = 240.900 ms
[… the remaining 'ptxas info' reports for the qk_int_sv_f16_attn_kernel instantiations on 'sm_80' are elided: register, barrier, stack, and cmem usage are identical on both sides, and only the "Compile time = … ms" lines change, each by roughly 5–10 ms …]
-sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> buildPhase completed in 3 minutes 56 seconds
+sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> buildPhase completed in 3 minutes 51 seconds
sage_attention-torch-ext> Running phase: installPhase
sage_attention-torch-ext> install flags: -j21 install
sage_attention-torch-ext> [0/1] Install the project...
sage_attention-torch-ext> -- Install configuration: "Release"
-sage_attention-torch-ext> -- Installing: /nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/_sage_attention_af2d0c0_dirty/_sage_attention_af2d0c0_dirty.abi3.so
+sage_attention-torch-ext> -- Installing: /nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/_sage_attention_1369690_dirty/_sage_attention_1369690_dirty.abi3.so
sage_attention-torch-ext> Running phase: fixupPhase
-sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext
-sage_attention-torch-ext> shrinking /nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so
-sage_attention-torch-ext> checking for references to /build/ in /nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext...
-sage_attention-torch-ext> patching script interpreter paths in /nix/store/ki5ldbx0351svgxhqw7y30n8kbi51l55-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext/sage_attention/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/1p5vv3ybc6igpc3xin04i8p0d1hwbmw9-sage_attention-torch-ext
sage_attention-torch-ext> Running phase: installCheckPhase
sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
sage_attention-torch-ext> Checking of ABI compatibility
@@ -12188,1326 +12188,1326 @@ sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 an
sage_attention-torch-ext> ✅ No compatibility issues found
sage_attention-torch-ext> Checking loading kernel with get_kernel
sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
-sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/fused/fused.cu.o
+sage_attention-torch-ext> [10/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/fused/fused.cu.o
sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used
sage_attention-torch-ext> ptxas info : 28 bytes gmem
[… the 'ptxas info' reports for the fused.cu kernels (MeanScaleKernel, TransposePadPermuteKernel, SubMeanKernel, QuantInt8Kernel) on 'sm_90a', 'sm_80', and 'sm_89' are elided: register, barrier, smem, and cmem usage are identical on both sides, and only the "Compile time = … ms" lines differ, each by a fraction of a millisecond …]
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj' for 'sm_89'
sage_attention-torch-ext> ptxas info : Function properties for _Z25TransposePadPermuteKernelILj64ELj64ELb1E6__halfEvPT2_S2_jjjjjjj
sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 19 registers, used 1 barriers, 16384 bytes smem, 396 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.160 ms +sage_attention-torch-ext> ptxas info : Compile time = 3.044 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.087 ms +sage_attention-torch-ext> ptxas info : Compile time = 2.993 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E13__nv_bfloat16EvPT2_S2_P6__halfjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 16 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 3.044 ms +sage_attention-torch-ext> ptxas info : Compile time = 2.959 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj128ELj64ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 2.654 ms +sage_attention-torch-ext> ptxas info : Compile time = 2.543 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z13SubMeanKernelILj64ELj128ELj1E6__halfEvPT2_S2_PS0_jjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 18 registers, used 0 barriers, 412 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 2.636 ms +sage_attention-torch-ext> ptxas info : Compile time = 2.561 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.138 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.849 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for 
_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.063 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.823 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.123 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.873 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.089 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.815 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.168 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.938 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj32ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.066 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.858 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.157 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.917 ms 
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj16ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.096 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.860 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 38 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.879 ms +sage_attention-torch-ext> ptxas info : Compile time = 10.552 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 8.060 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.813 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 25 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 8.066 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.812 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 8.052 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.778 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
sage_attention-torch-ext> ptxas info : Used 40 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.882 ms +sage_attention-torch-ext> ptxas info : Compile time = 10.568 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 8.184 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.866 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 8.184 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.927 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb1E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 8.697 ms +sage_attention-torch-ext> ptxas info : Compile time = 8.427 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 9.729 ms +sage_attention-torch-ext> ptxas info : Compile time = 9.414 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.081 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.852 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> 
ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.101 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.880 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.084 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.865 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 9.615 ms +sage_attention-torch-ext> ptxas info : Compile time = 9.326 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.109 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.880 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.158 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.919 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb0ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.146 ms +sage_attention-torch-ext> ptxas info : Compile time = 6.902 
ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 10.199 ms +sage_attention-torch-ext> ptxas info : Compile time = 9.837 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.248 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.024 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.267 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.033 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E13__nv_bfloat16EvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.713 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.505 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj128ELj2ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 33 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 11.405 ms +sage_attention-torch-ext> ptxas info : Compile time = 9.974 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj128ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.241 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.021 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj128ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 27 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.283 ms +sage_attention-torch-ext> ptxas info : Compile time = 7.030 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj' for 'sm_89' sage_attention-torch-ext> ptxas info : Function properties for _Z15QuantInt8KernelILj64ELj64ELj1ELb1ELb0E6__halfEvPT4_S2_PaPffjjjjjjjjjjj sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 26 registers, used 1 barriers, 132 bytes smem, 432 bytes cmem[0] -sage_attention-torch-ext> ptxas info : Compile time = 7.236 ms -sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_af2d0c0_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o +sage_attention-torch-ext> ptxas info : Compile time = 7.007 ms +sage_attention-torch-ext> [11/12] Building CUDA object CMakeFiles/_sage_attention_1369690_dirty.dir/sage_attention/qattn/qk_int_sv_f16_cuda_sm80.cu.o sage_attention-torch-ext> nvcc warning : incompatible redefinition for option 'threads', the last value of this option was used sage_attention-torch-ext> ptxas info : 28 bytes gmem, 224 bytes cmem[4] sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 571.316 ms +sage_attention-torch-ext> ptxas info : Compile time = 552.683 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 248 registers, used 1 barriers, 480 bytes cmem[0], 
16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 536.617 ms +sage_attention-torch-ext> ptxas info : Compile time = 523.194 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 571.602 ms +sage_attention-torch-ext> ptxas info : Compile time = 549.323 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 249 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 552.467 ms +sage_attention-torch-ext> ptxas info : Compile time = 525.347 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 483.713 ms +sage_attention-torch-ext> ptxas info : Compile time = 466.332 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] 
-sage_attention-torch-ext> ptxas info : Compile time = 463.155 ms +sage_attention-torch-ext> ptxas info : Compile time = 448.298 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 483.172 ms +sage_attention-torch-ext> ptxas info : Compile time = 470.979 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 254 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 458.046 ms +sage_attention-torch-ext> ptxas info : Compile time = 446.092 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 500.637 ms +sage_attention-torch-ext> ptxas info : Compile time = 482.331 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas 
info : Compile time = 476.593 ms +sage_attention-torch-ext> ptxas info : Compile time = 461.838 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 583.442 ms +sage_attention-torch-ext> ptxas info : Compile time = 565.845 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 250 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 557.485 ms +sage_attention-torch-ext> ptxas info : Compile time = 545.442 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 501.026 ms +sage_attention-torch-ext> ptxas info : Compile time = 487.579 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 475.652 ms 
+sage_attention-torch-ext> ptxas info : Compile time = 461.041 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 509.803 ms +sage_attention-torch-ext> ptxas info : Compile time = 492.527 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 252 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 481.269 ms +sage_attention-torch-ext> ptxas info : Compile time = 467.901 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 227 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 240.480 ms +sage_attention-torch-ext> ptxas info : Compile time = 234.038 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 217 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 228.315 ms +sage_attention-torch-ext> ptxas info 
: Compile time = 222.115 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 255.751 ms +sage_attention-torch-ext> ptxas info : Compile time = 249.307 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 247.640 ms +sage_attention-torch-ext> ptxas info : Compile time = 240.711 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 248.706 ms +sage_attention-torch-ext> ptxas info : Compile time = 242.804 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 205 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 247.183 ms +sage_attention-torch-ext> ptxas info : Compile time = 240.849 ms 
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 202 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 263.317 ms +sage_attention-torch-ext> ptxas info : Compile time = 255.302 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode0ELb1ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 202 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 253.588 ms +sage_attention-torch-ext> ptxas info : Compile time = 245.459 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 265.679 ms +sage_attention-torch-ext> ptxas info : Compile time = 256.108 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0ES2_L11ComputeUnit0EL8MaskMode1ELb0ELb1EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 253 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 256.168 ms +sage_attention-torch-ext> ptxas info : Compile time = 247.772 ms sage_attention-torch-ext> ptxas info : 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80'
sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3E6__halfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb1EEvPaS6_PS2_PT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf
sage_attention-torch-ext>     32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
sage_attention-torch-ext> ptxas info : Used 200 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2]
-sage_attention-torch-ext> ptxas info : Compile time = 284.953 ms
+sage_attention-torch-ext> ptxas info : Compile time = 275.405 ms

[... dozens of further per-kernel ptxas reports for qk_int_sv_f16_attn_kernel instantiations on 'sm_80' follow this same five-line pattern and are elided here. Across them: 176-255 registers and 1 barrier per kernel, 0-64 byte stack frames, 480 bytes of cmem[0] (often plus 8 or 16 bytes of cmem[2]), and zero spills except for a handful of 255-register variants near the end of the log, which spill up to 60 bytes of stores and 68 bytes of loads. Only the "Compile time" lines differ between the old (-) and new (+) builds: roughly 200-545 ms per kernel, consistently about 2-4% lower in the new build. The log resumes in the same pattern after the sketch below.]
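For context on how to read these reports: ptxas prints one such block per compiled entry function when the build forwards verbose flags to the PTX assembler (for example `-Xptxas=-v`), and the quoted `_Z25qk_int_sv_f16_attn_kernelI...` strings are ordinary Itanium-mangled C++ template instantiations, so `c++filt` recovers the readable signature. A minimal sketch of what produces this kind of output, assuming a standalone file `probe.cu` with a hypothetical kernel `probe` (neither is part of the sage_attention sources):

// probe.cu - hypothetical standalone example, not part of this build.
// Compile with verbose assembler statistics:
//   nvcc -arch=sm_80 -Xptxas=-v -c probe.cu
// ptxas then emits a report of the same shape as the log above for this
// entry function: registers, stack frame, spill stores/loads, cmem usage.
#include <cuda_fp16.h>

__global__ void probe(const __half* in, __half* out, unsigned n) {
    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // Trivial half-precision arithmetic; enough for ptxas to allocate
        // registers and report usage for sm_80.
        out[i] = __hadd(in[i], __float2half(1.0f));
    }
}

Nonzero spill stores/loads in a report (as in the 255-register instantiations elided above) mean the kernel exceeded its register budget and spilled values to local memory; together with the register counts, that is usually the first thing to check when these numbers move between builds.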
sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 473.836 ms +sage_attention-torch-ext> ptxas info : Compile time = 461.814 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 474.836 ms +sage_attention-torch-ext> ptxas info : Compile time = 459.742 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 490.383 ms +sage_attention-torch-ext> ptxas info : Compile time = 476.324 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 64 bytes stack frame, 60 bytes spill stores, 68 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 64 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 490.178 ms +sage_attention-torch-ext> ptxas info : 
Compile time = 475.340 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 499.579 ms +sage_attention-torch-ext> ptxas info : Compile time = 484.900 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 112 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 112 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 500.229 ms +sage_attention-torch-ext> ptxas info : Compile time = 485.926 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 489.701 ms +sage_attention-torch-ext> ptxas info : Compile time = 474.887 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 56 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 489.876 ms 
+sage_attention-torch-ext> ptxas info : Compile time = 474.075 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 498.759 ms +sage_attention-torch-ext> ptxas info : Compile time = 483.415 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj128EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 96 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 96 bytes cumulative stack size, 480 bytes cmem[0], 16 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 497.826 ms +sage_attention-torch-ext> ptxas info : Compile time = 482.843 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 245.600 ms +sage_attention-torch-ext> ptxas info : Compile time = 238.801 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 245.136 ms +sage_attention-torch-ext> ptxas info : Compile 
time = 236.384 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 251.077 ms +sage_attention-torch-ext> ptxas info : Compile time = 241.917 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 250.828 ms +sage_attention-torch-ext> ptxas info : Compile time = 242.693 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 242.714 ms +sage_attention-torch-ext> ptxas info : Compile time = 235.138 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 242.881 ms +sage_attention-torch-ext> ptxas info : Compile time = 235.253 ms sage_attention-torch-ext> ptxas info : 
Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 249.941 ms +sage_attention-torch-ext> ptxas info : Compile time = 241.941 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode0ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 249.919 ms +sage_attention-torch-ext> ptxas info : Compile time = 242.014 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 262.039 ms +sage_attention-torch-ext> ptxas info : Compile time = 254.167 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 261.386 ms +sage_attention-torch-ext> ptxas info : Compile time = 253.268 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 270.534 ms +sage_attention-torch-ext> ptxas info : Compile time = 260.712 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity3ELS1_3EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 269.921 ms +sage_attention-torch-ext> ptxas info : Compile time = 259.851 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 262.095 ms +sage_attention-torch-ext> ptxas info : Compile time = 253.822 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb0ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 261.711 ms +sage_attention-torch-ext> ptxas info : Compile time = 253.470 ms sage_attention-torch-ext> ptxas info : Compiling entry function 
'_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E13__nv_bfloat16L11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_P6__halfPT9_PfSA_SA_S9_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 269.020 ms +sage_attention-torch-ext> ptxas info : Compile time = 260.374 ms sage_attention-torch-ext> ptxas info : Compiling entry function '_Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf' for 'sm_80' sage_attention-torch-ext> ptxas info : Function properties for _Z25qk_int_sv_f16_attn_kernelILj128ELj64ELj32ELj64ELj64EL8DataType1EL16QuantGranularity2ELS1_2EfLb0E6__halfL11ComputeUnit0EL8MaskMode1ELb1ELb0EEvPaS5_PS2_PT9_PfS9_S9_S8_jjjjjjjjjjjjjjjf sage_attention-torch-ext> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads sage_attention-torch-ext> ptxas info : Used 255 registers, used 1 barriers, 32 bytes cumulative stack size, 480 bytes cmem[0], 8 bytes cmem[2] -sage_attention-torch-ext> ptxas info : Compile time = 268.515 ms -sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_af2d0c0_dirty.abi3.so -sage_attention-torch-ext> buildPhase completed in 5 minutes 6 seconds +sage_attention-torch-ext> ptxas info : Compile time = 259.803 ms +sage_attention-torch-ext> [12/12] Linking CXX shared module _sage_attention_1369690_dirty.abi3.so +sage_attention-torch-ext> buildPhase completed in 5 minutes 8 seconds sage_attention-torch-ext> Running phase: installPhase sage_attention-torch-ext> install flags: -j21 install sage_attention-torch-ext> [0/1] Install the project... sage_attention-torch-ext> -- Install configuration: "Release" -sage_attention-torch-ext> -- Installing: /nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/_sage_attention_af2d0c0_dirty/_sage_attention_af2d0c0_dirty.abi3.so +sage_attention-torch-ext> -- Installing: /nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/_sage_attention_1369690_dirty/_sage_attention_1369690_dirty.abi3.so sage_attention-torch-ext> Running phase: fixupPhase -sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext -sage_attention-torch-ext> shrinking /nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext/sage_attention/_sage_attention_af2d0c0_dirty.abi3.so -sage_attention-torch-ext> checking for references to /build/ in /nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext... 
-sage_attention-torch-ext> patching script interpreter paths in /nix/store/zrx3aflqjvr10nv91lgyfynpa623nsha-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking RPATHs of ELF executables and libraries in /nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext
+sage_attention-torch-ext> shrinking /nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext/sage_attention/_sage_attention_1369690_dirty.abi3.so
+sage_attention-torch-ext> checking for references to /build/ in /nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext...
+sage_attention-torch-ext> patching script interpreter paths in /nix/store/n0xpmbwrclkfxs3sc50hr91s0180aqim-sage_attention-torch-ext
 sage_attention-torch-ext> Running phase: installCheckPhase
 sage_attention-torch-ext> no Makefile or custom installCheckPhase, doing nothing
 sage_attention-torch-ext> Checking of ABI compatibility
@@ -13515,5 +13515,5 @@ sage_attention-torch-ext> 🐍 Checking for compatibility with manylinux_2_28 an
 sage_attention-torch-ext> ✅ No compatibility issues found
 sage_attention-torch-ext> Checking loading kernel with get_kernel
 sage_attention-torch-ext> Check whether the kernel can be loaded with get-kernel: sage_attention
-building '/nix/store/zcfc2w942q3a6lpp77cmz64zdis9i1dz-torch-ext-bundle.drv'...
-building '/nix/store/q2d20wl8cfvw82mp757i59cvq8z9wmpv-build-and-copy.drv'...
+building '/nix/store/ll29z68nd2l37pyv7ihc5sw3qv1wffjp-torch-ext-bundle.drv'...
+building '/nix/store/dyhilzcxbx6rq6fhn6kpnhig1jf5rv4c-build-and-copy.drv'...
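The hunks above only shift compile times, build duration, and store hashes, but a log like this is easier to audit mechanically than by eye. Below is a minimal sketch of a summarizer for ptxas -v output; the script and its regexes are my own illustration derived solely from the line formats visible in the log, not part of this repository or its build:

    import re
    import sys

    # Patterns matching the ptxas -v lines shown in the log above.
    REGS = re.compile(r"Used (\d+) registers")
    SPILL = re.compile(r"(\d+) bytes spill stores, (\d+) bytes spill loads")
    TIME = re.compile(r"Compile time = ([\d.]+) ms")

    regs, spills, times = [], [], []
    for line in sys.stdin:
        if m := REGS.search(line):
            regs.append(int(m.group(1)))
        if m := SPILL.search(line):
            spills.append(int(m.group(1)) + int(m.group(2)))
        if m := TIME.search(line):
            times.append(float(m.group(1)))

    if times and regs:
        print(f"{len(times)} kernels, max {max(regs)} registers, "
              f"{sum(s > 0 for s in spills)} with spills, "
              f"{sum(times) / 1000:.1f} s total ptxas time")

Run against the sm_80 section above it would flag the fp32-accumulator variants (255 registers with spills) as the likely occupancy bottlenecks.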
diff --git a/tests/test_sage_attention.py b/tests/test_sage_attention.py
deleted file mode 100644
index ef10d3f82e384a0920fb1a86e4cc9c195ebc1185..0000000000000000000000000000000000000000
--- a/tests/test_sage_attention.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import sage_attention
-import torch
-
-print(dir(sage_attention))
-
-# Skip tests gracefully if CUDA is unavailable (e.g., no driver/GPU present)
-if not torch.cuda.is_available():
-    print("CUDA is not available; skipping SageAttention tests.")
-    raise SystemExit(0)
-
-
-def test_per_block_int8():
-    q = torch.randn(1, 1024, 1024, 128, dtype=torch.float16, device="cuda")
-    k = torch.randn(1, 1024, 1024, 128, dtype=torch.float16, device="cuda")
-    km = torch.randn(1, 1024, 128, dtype=torch.float16, device="cuda")
-    q_int8, q_scale, k_int8, k_scale = sage_attention.per_block_int8(q, k, km)
-    print(q_int8.shape, q_scale.shape, k_int8.shape, k_scale.shape)
-
-
-# test_per_block_int8()
-
-
-def test_per_channel_fp8():
-    v = torch.randn(1, 1024, 1024, 128, dtype=torch.float16, device="cuda")
-    v_fp8, v_scale, vm = sage_attention.per_channel_fp8(
-        v, tensor_layout="HND", smooth_v=True
-    )
-    print(v_fp8.shape, v_scale.shape, vm.shape)
-
-
-def test_sageattn():
-    # The expected shape of query_scale is derived from the kernel's block sizes,
-    # so use a q/k/v shape that matches the kernel's expectations.
-    # For HND: (batch, nheads, seqlen, head_dim)
-    # Use seqlen = 128, a multiple of CTA_Q=128 and WARP_Q=16.
-    # This makes div_ceil(qo_len, CTA_Q) = 1 and (CTA_Q / WARP_Q) = 8, so scale shape = (1, 1024, 8)
-    q = torch.randn(1, 1024, 128, 128, dtype=torch.float16, device="cuda")
-    k = torch.randn(1, 1024, 128, 128, dtype=torch.float16, device="cuda")
-    v = torch.randn(1, 1024, 128, 128, dtype=torch.float16, device="cuda")
-    # Compare SageAttention to standard attention
-    # o_sage = sage_attention.sageattn(
-    #     q, k, v, tensor_layout="HND", is_causal=False, return_lse=False
-    # )
-    o_sage = sage_attention.sageattn_qk_int8_pv_fp8_cuda(
-        q, k, v,
-        tensor_layout="HND",
-        is_causal=False,
-        qk_quant_gran="per_warp",
-        pv_accum_dtype="fp32+fp32",  # required for sm90
-        return_lse=False,
-    )
-    # Force a sync so any async kernel failure surfaces here (not on the next cuBLAS call)
-
-    # Standard attention for comparison
-    # q, k, v: (batch, nheads, seqlen, head_dim)
-    attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (q.shape[-1] ** 0.5)
-    attn_probs = torch.softmax(attn_scores, dim=-1)
-    o_ref = torch.matmul(attn_probs, v)
-
-    print("SageAttention output shape:", o_sage.shape)
-    print("Standard attention output shape:", o_ref.shape)
-    print("Max abs diff:", (o_sage.float() - o_ref).abs().max().item())
-
-
-test_sageattn()
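The scale-shape comment in the deleted test is worth keeping as a worked example. This is a sketch of that arithmetic only, under the tile sizes the comment assumes (CTA_Q=128, WARP_Q=16); the helper names here are hypothetical, not package API:

    def div_ceil(a: int, b: int) -> int:
        return (a + b - 1) // b

    def per_warp_scale_shape(batch, nheads, qo_len, cta_q=128, warp_q=16):
        # One INT8 scale per warp tile: div_ceil(qo_len, CTA_Q) CTA tiles
        # along the sequence, each split into CTA_Q // WARP_Q warp tiles.
        return (batch, nheads, div_ceil(qo_len, cta_q) * (cta_q // warp_q))

    # The test's q is (1, 1024, 128, 128) in HND layout, i.e. batch=1,
    # nheads=1024, qo_len=128, giving the scale shape the comment predicts.
    assert per_warp_scale_shape(1, 1024, 128) == (1, 1024, 8)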
diff --git a/torch-ext/sage_attention/__init__.py b/torch-ext/sage_attention/__init__.py
index 5df69e92664edfbd0820d5126792e41e23a72762..43b5e35a4154627e6457993372bd46b2cf78b89a 100644
--- a/torch-ext/sage_attention/__init__.py
+++ b/torch-ext/sage_attention/__init__.py
@@ -1,5 +1,5 @@
 from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8
-from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda
+from .core import sageattn
 
 
 __all__ = [
@@ -8,5 +8,4 @@ __all__ = [
     "sub_mean",
     "per_channel_fp8",
     "sageattn",
-    "sageattn_qk_int8_pv_fp8_cuda",
 ]
\ No newline at end of file
diff --git a/torch-ext/sage_attention/core.py b/torch-ext/sage_attention/core.py
index dc44a8e1ee17a5c5c65da5adda6faf9228cca55e..590e664e325d79068a56ee796a548f92ead07442 100644
--- a/torch-ext/sage_attention/core.py
+++ b/torch-ext/sage_attention/core.py
@@ -363,7 +363,7 @@ def sageattn_qk_int8_pv_fp16_cuda(
 
     if pv_accum_dtype == "fp32":
         v = v.to(torch.float16)
-        lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn(
+        lse = ops.qk_int8_sv_f16_accum_f32_attn(
             q_int8,
             k_int8,
             v,
@@ -379,7 +379,7 @@
     elif pv_accum_dtype == "fp16":
         if smooth_v:
             smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout)
-            lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
+            lse = ops.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
                 q_int8,
                 k_int8,
                 smoothed_v,
@@ -395,7 +395,7 @@
             )
         else:
             v = v.to(torch.float16)
-            lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn(
+            lse = ops.qk_int8_sv_f16_accum_f16_attn(
                 q_int8,
                 k_int8,
                 v,
@@ -410,7 +410,7 @@
             )
     elif pv_accum_dtype == "fp16+fp32":
         v = v.to(torch.float16)
-        lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf(
+        lse = ops.qk_int8_sv_f16_accum_f16_attn_inst_buf(
             q_int8,
             k_int8,
             v,
@@ -941,20 +941,6 @@ def sageattn_qk_int8_pv_fp8_cuda_sm90(
             _return_lse,
         )
     elif pv_accum_dtype == "fp32+fp32":
-        print(
-            "qint8",
-            q_int8.shape,
-            "qscale",
-            q_scale.shape,
-            "kint8",
-            k_int8.shape,
-            "kscale",
-            k_scale.shape,
-            "vfp8",
-            v_fp8.shape,
-            "vscale",
-            v_scale.shape,
-        )
         lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(
             q_int8,
             k_int8,
diff --git a/torch-ext/sage_attention/layers.py b/torch-ext/sage_attention/layers.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
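With sageattn_qk_int8_pv_fp8_cuda no longer exported from __init__.py, callers go through the high-level sageattn entry point, which selects the quantized kernel variant internally. A minimal usage sketch, assuming a CUDA-capable device and the keyword arguments shown in the deleted test's commented-out sageattn call; the shapes here are illustrative:

    import torch
    import sage_attention

    # HND layout, as in the deleted test: (batch, nheads, seqlen, head_dim).
    q = torch.randn(1, 32, 128, 128, dtype=torch.float16, device="cuda")
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    # Only the high-level dispatcher remains public after this change.
    o = sage_attention.sageattn(
        q, k, v, tensor_layout="HND", is_causal=False, return_lse=False
    )
    print(o.shape)  # torch.Size([1, 32, 128, 128])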