fbgemm_remove_unused.patch 15 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 2244ea6f..96265a48 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -94,14 +94,14 @@ endif()
 # Build Experimental Modules
 ################################################################################

-if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
-  # TODO: Figure out NCCL/RCCL integration with ROCm
-  add_subdirectory(experimental/example)
-endif()
-
-if(NOT FBGEMM_CPU_ONLY)
-  add_subdirectory(experimental/gemm)
-endif()
+# if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
+#   # TODO: Figure out NCCL/RCCL integration with ROCm
+#   add_subdirectory(experimental/example)
+# endif()
+
+# if(NOT FBGEMM_CPU_ONLY)
+#   add_subdirectory(experimental/gemm)
+# endif()

 if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
   # CUTLASS currently doesn't build on ROCm and CK hasnt yet been added:
diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake
index c56773fe..0c0d349e 100644
--- a/fbgemm_gpu/FbgemmGpu.cmake
+++ b/fbgemm_gpu/FbgemmGpu.cmake
@@ -446,53 +446,55 @@ set_source_files_properties(${fbgemm_sources}
 ################################################################################

 set(fbgemm_gpu_sources_static_cpu
-    codegen/training/forward/embedding_forward_split_cpu.cpp
-    codegen/inference/embedding_forward_quantized_host_cpu.cpp
-    codegen/training/backward/embedding_backward_dense_host_cpu.cpp
-    codegen/utils/embedding_bounds_check_host_cpu.cpp
-    src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
-    src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
-    src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
-    src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
-    src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
-    src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
-    src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
-    src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
-    src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
-    src/input_combine_ops/input_combine_cpu.cpp
-    src/layout_transform_ops/layout_transform_ops_cpu.cpp
+    # codegen/training/forward/embedding_forward_split_cpu.cpp
+    # codegen/inference/embedding_forward_quantized_host_cpu.cpp
+    # codegen/training/backward/embedding_backward_dense_host_cpu.cpp
+    # codegen/utils/embedding_bounds_check_host_cpu.cpp
+    # src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
+    # src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
+    # src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
+    # src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
+    # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
+    # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
+    # src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
+    # src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
+    # src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
+    # src/input_combine_ops/input_combine_cpu.cpp
+    # src/layout_transform_ops/layout_transform_ops_cpu.cpp
     src/quantize_ops/quantize_ops_cpu.cpp
     src/quantize_ops/quantize_ops_meta.cpp
-    src/sparse_ops/sparse_ops_cpu.cpp
-    src/sparse_ops/sparse_ops_meta.cpp
-    src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
-    src/split_embeddings_cache/linearize_cache_indices.cpp
-    src/split_embeddings_cache/lfu_cache_populate_byte.cpp
-    src/split_embeddings_cache/lru_cache_populate_byte.cpp
-    src/split_embeddings_cache/lxu_cache.cpp
-    src/split_embeddings_cache/split_embeddings_cache_ops.cpp
-    codegen/training/index_select/batch_index_select_dim0_ops.cpp
-    codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp)
+    # src/sparse_ops/sparse_ops_cpu.cpp
+    # src/sparse_ops/sparse_ops_meta.cpp
+    # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
+    # src/split_embeddings_cache/linearize_cache_indices.cpp
+    # src/split_embeddings_cache/lfu_cache_populate_byte.cpp
+    # src/split_embeddings_cache/lru_cache_populate_byte.cpp
+    # src/split_embeddings_cache/lxu_cache.cpp
+    # src/split_embeddings_cache/split_embeddings_cache_ops.cpp
+    # codegen/training/index_select/batch_index_select_dim0_ops.cpp
+    # codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp)
+)

 if(NOT FBGEMM_CPU_ONLY)
   list(APPEND fbgemm_gpu_sources_static_cpu
-    codegen/inference/embedding_forward_quantized_host.cpp
-    codegen/utils/embedding_bounds_check_host.cpp
-    src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
-    src/layout_transform_ops/layout_transform_ops_gpu.cpp
-    src/memory_utils/memory_utils.cpp
-    src/memory_utils/memory_utils_ops.cpp
-    src/memory_utils/memory_utils_ops_cpu.cpp
-    src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
-    src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
+    # codegen/inference/embedding_forward_quantized_host.cpp
+    # codegen/utils/embedding_bounds_check_host.cpp
+    # src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
+    # src/layout_transform_ops/layout_transform_ops_gpu.cpp
+    # src/memory_utils/memory_utils.cpp
+    # src/memory_utils/memory_utils_ops.cpp
+    # src/memory_utils/memory_utils_ops_cpu.cpp
+    # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
+    # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
     src/quantize_ops/quantize_ops_gpu.cpp
-    src/sparse_ops/sparse_ops_gpu.cpp
-    src/split_embeddings_utils/split_embeddings_utils.cpp
-    src/split_embeddings_cache/split_embeddings_cache_ops.cu
-    src/metric_ops/metric_ops_host.cpp
-    src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
-    src/input_combine_ops/input_combine_gpu.cpp
-    codegen/training/index_select/batch_index_select_dim0_host.cpp)
+    # src/sparse_ops/sparse_ops_gpu.cpp
+    # src/split_embeddings_utils/split_embeddings_utils.cpp
+    # src/split_embeddings_cache/split_embeddings_cache_ops.cu
+    # src/metric_ops/metric_ops_host.cpp
+    # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
+    # src/input_combine_ops/input_combine_gpu.cpp
+    # codegen/training/index_select/batch_index_select_dim0_host.cpp)
+  )

   if(NVML_LIB_PATH OR USE_ROCM)
     message(STATUS "Adding merge_pooled_embeddings sources")
@@ -516,36 +518,36 @@ endif()

 if(NOT FBGEMM_CPU_ONLY)
   set(fbgemm_gpu_sources_static_gpu
-      codegen/utils/embedding_bounds_check.cu
-      codegen/inference/embedding_forward_quantized_split_lookup.cu
-      src/embedding_inplace_ops/embedding_inplace_update.cu
-      src/histogram_binning_calibration_ops.cu
-      src/input_combine_ops/input_combine.cu
-      src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
-      src/memory_utils/memory_utils.cu
-      src/memory_utils/memory_utils_ops.cu
-      src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
-      src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
-      src/jagged_tensor_ops/dense_to_jagged_forward.cu
-      src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
-      src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu
-      src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu
-      src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu
-      src/jagged_tensor_ops/jagged_index_add_2d_forward.cu
-      src/jagged_tensor_ops/jagged_index_select_2d_forward.cu
-      src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
-      src/jagged_tensor_ops/jagged_softmax_backward.cu
-      src/jagged_tensor_ops/jagged_softmax_forward.cu
-      src/jagged_tensor_ops/jagged_tensor_ops.cu
-      src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu
-      src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
-      src/jagged_tensor_ops/jagged_unique_indices.cu
-      src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
-      src/layout_transform_ops/layout_transform_ops.cu
-      src/metric_ops/metric_ops.cu
-      src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
-      src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
-      src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu
+      # codegen/utils/embedding_bounds_check.cu
+      # codegen/inference/embedding_forward_quantized_split_lookup.cu
+      # src/embedding_inplace_ops/embedding_inplace_update.cu
+      # src/histogram_binning_calibration_ops.cu
+      # src/input_combine_ops/input_combine.cu
+      # src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
+      # src/memory_utils/memory_utils.cu
+      # src/memory_utils/memory_utils_ops.cu
+      # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
+      # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
+      # src/jagged_tensor_ops/dense_to_jagged_forward.cu
+      # src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
+      # src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu
+      # src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu
+      # src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu
+      # src/jagged_tensor_ops/jagged_index_add_2d_forward.cu
+      # src/jagged_tensor_ops/jagged_index_select_2d_forward.cu
+      # src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
+      # src/jagged_tensor_ops/jagged_softmax_backward.cu
+      # src/jagged_tensor_ops/jagged_softmax_forward.cu
+      # src/jagged_tensor_ops/jagged_tensor_ops.cu
+      # src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu
+      # src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
+      # src/jagged_tensor_ops/jagged_unique_indices.cu
+      # src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
+      # src/layout_transform_ops/layout_transform_ops.cu
+      # src/metric_ops/metric_ops.cu
+      # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
+      # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
+      # src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu
       src/quantize_ops/quantize_bfloat16.cu
       src/quantize_ops/quantize_fp8_rowwise.cu
       src/quantize_ops/quantize_fused_8bit_rowwise.cu
@@ -554,39 +556,40 @@ if(NOT FBGEMM_CPU_ONLY)
       src/quantize_ops/quantize_msfp.cu
       src/quantize_ops/quantize_padded_fp8_rowwise.cu
       src/quantize_ops/quantize_mx.cu
-      src/sparse_ops/sparse_async_cumsum.cu
-      src/sparse_ops/sparse_block_bucketize_features.cu
-      src/sparse_ops/sparse_bucketize_features.cu
-      src/sparse_ops/sparse_batched_unary_embeddings.cu
-      src/sparse_ops/sparse_compute_frequency_sequence.cu
-      src/sparse_ops/sparse_expand_into_jagged_permute.cu
-      src/sparse_ops/sparse_group_index.cu
-      src/sparse_ops/sparse_index_add.cu
-      src/sparse_ops/sparse_index_select.cu
-      src/sparse_ops/sparse_invert_permute.cu
-      src/sparse_ops/sparse_pack_segments_backward.cu
-      src/sparse_ops/sparse_pack_segments_forward.cu
-      src/sparse_ops/sparse_permute_1d.cu
-      src/sparse_ops/sparse_permute_2d.cu
-      src/sparse_ops/sparse_permute102.cu
-      src/sparse_ops/sparse_permute_embeddings.cu
-      src/sparse_ops/sparse_range.cu
-      src/sparse_ops/sparse_reorder_batched_ad.cu
-      src/sparse_ops/sparse_segment_sum_csr.cu
-      src/sparse_ops/sparse_zipf.cu
-      src/split_embeddings_cache/lfu_cache_find.cu
-      src/split_embeddings_cache/lfu_cache_populate.cu
-      src/split_embeddings_cache/lfu_cache_populate_byte.cu
-      src/split_embeddings_cache/lru_cache_find.cu
-      src/split_embeddings_cache/lru_cache_populate.cu
-      src/split_embeddings_cache/lru_cache_populate_byte.cu
-      src/split_embeddings_cache/lxu_cache.cu
-      src/split_embeddings_cache/linearize_cache_indices.cu
-      src/split_embeddings_cache/reset_weight_momentum.cu
-      src/split_embeddings_utils/generate_vbe_metadata.cu
-      src/split_embeddings_utils/get_infos_metadata.cu
-      src/split_embeddings_utils/radix_sort_pairs.cu
-      src/split_embeddings_utils/transpose_embedding_input.cu)
+      # src/sparse_ops/sparse_async_cumsum.cu
+      # src/sparse_ops/sparse_block_bucketize_features.cu
+      # src/sparse_ops/sparse_bucketize_features.cu
+      # src/sparse_ops/sparse_batched_unary_embeddings.cu
+      # src/sparse_ops/sparse_compute_frequency_sequence.cu
+      # src/sparse_ops/sparse_expand_into_jagged_permute.cu
+      # src/sparse_ops/sparse_group_index.cu
+      # src/sparse_ops/sparse_index_add.cu
+      # src/sparse_ops/sparse_index_select.cu
+      # src/sparse_ops/sparse_invert_permute.cu
+      # src/sparse_ops/sparse_pack_segments_backward.cu
+      # src/sparse_ops/sparse_pack_segments_forward.cu
+      # src/sparse_ops/sparse_permute_1d.cu
+      # src/sparse_ops/sparse_permute_2d.cu
+      # src/sparse_ops/sparse_permute102.cu
+      # src/sparse_ops/sparse_permute_embeddings.cu
+      # src/sparse_ops/sparse_range.cu
+      # src/sparse_ops/sparse_reorder_batched_ad.cu
+      # src/sparse_ops/sparse_segment_sum_csr.cu
+      # src/sparse_ops/sparse_zipf.cu
+      # src/split_embeddings_cache/lfu_cache_find.cu
+      # src/split_embeddings_cache/lfu_cache_populate.cu
+      # src/split_embeddings_cache/lfu_cache_populate_byte.cu
+      # src/split_embeddings_cache/lru_cache_find.cu
+      # src/split_embeddings_cache/lru_cache_populate.cu
+      # src/split_embeddings_cache/lru_cache_populate_byte.cu
+      # src/split_embeddings_cache/lxu_cache.cu
+      # src/split_embeddings_cache/linearize_cache_indices.cu
+      # src/split_embeddings_cache/reset_weight_momentum.cu
+      # src/split_embeddings_utils/generate_vbe_metadata.cu
+      # src/split_embeddings_utils/get_infos_metadata.cu
+      # src/split_embeddings_utils/radix_sort_pairs.cu
+      # src/split_embeddings_utils/transpose_embedding_input.cu)
+  )

   set_source_files_properties(${fbgemm_gpu_sources_static_gpu}
     PROPERTIES COMPILE_OPTIONS
diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
index 01f1d6ab..a6b8d7a8 100644
--- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
+++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
@@ -25,23 +25,24 @@ set(fbgemm_sources_include_directories
   ${THIRDPARTY}/json/include
   ${NCCL_INCLUDE_DIRS})

-set(attention_ops_sources
-    src/attention/attention.cpp
-    src/attention/gqa_attn_splitk.cu)
+# set(attention_ops_sources
+#     src/attention/attention.cpp
+#     src/attention/gqa_attn_splitk.cu)

 set(quantize_ops_sources
     src/quantize/cutlass_extensions.cu
     src/quantize/quantize.cu
     src/quantize/quantize.cpp)

-set(comm_ops_sources
-    src/comm/car.cu
-    src/comm/car.cpp)
+# set(comm_ops_sources
+#     src/comm/car.cu
+#     src/comm/car.cpp)

 set(experimental_gen_ai_cpp_source_files
-    ${attention_ops_sources}
+    # ${attention_ops_sources}
     ${quantize_ops_sources}
-    ${comm_ops_sources})
+    # ${comm_ops_sources}
+)

 set_source_files_properties(${experimental_gen_ai_cpp_source_files}
     PROPERTIES INCLUDE_DIRECTORIES