From 116d0df3aa56076c112eef4c73116ae2778bf788 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sat, 26 Oct 2024 11:34:42 +0000 Subject: [PATCH] 2024-10-26 nightly release (f606d5cd499f57fa44e0697330689256c8a0b386) --- .pyre_configuration | 2 +- torchrec/datasets/criteo.py | 2 +- torchrec/datasets/tests/test_criteo.py | 2 +- .../distributed/batched_embedding_kernel.py | 22 ++++++++++++ .../distributed/benchmark/benchmark_utils.py | 3 +- .../distributed/keyed_jagged_tensor_pool.py | 4 +-- torchrec/distributed/object_pool.py | 10 +++--- .../planner/tests/test_partitioners.py | 4 +++ torchrec/distributed/shards_wrapper.py | 4 +-- torchrec/distributed/tensor_pool.py | 2 +- torchrec/distributed/tests/test_awaitable.py | 4 +++ .../distributed/tests/test_embedding_types.py | 13 ++++--- .../distributed/tests/test_lazy_awaitable.py | 2 -- .../train_pipeline/train_pipelines.py | 1 + torchrec/distributed/types.py | 4 ++- torchrec/distributed/utils.py | 6 +++- .../inference/inference_legacy/__init__.py | 2 ++ torchrec/linter/module_linter.py | 2 ++ torchrec/metrics/tests/test_metric_module.py | 1 + torchrec/modules/utils.py | 35 ++++++++++++++++--- torchrec/quant/embedding_modules.py | 10 +++++- 21 files changed, 104 insertions(+), 31 deletions(-) diff --git a/.pyre_configuration b/.pyre_configuration index c88fbe109..4249f9c02 100644 --- a/.pyre_configuration +++ b/.pyre_configuration @@ -13,5 +13,5 @@ } ], "strict": true, - "version": "0.0.101703592829" + "version": "0.0.101729681899" } diff --git a/torchrec/datasets/criteo.py b/torchrec/datasets/criteo.py index 7252ad474..3ade6ee29 100644 --- a/torchrec/datasets/criteo.py +++ b/torchrec/datasets/criteo.py @@ -252,7 +252,7 @@ def row_mapper(row: List[str]) -> Tuple[List[int], List[int], int]: # using int64. Numpy will automatically handle dense values >= 2 ** 31. dense_np = np.array(dense, dtype=np.int32) del dense - sparse_np = np.array(sparse, dtype=np.int32) + sparse_np = np.array(sparse, dtype=np.int64) del sparse labels_np = np.array(labels, dtype=np.int32) del labels diff --git a/torchrec/datasets/tests/test_criteo.py b/torchrec/datasets/tests/test_criteo.py index 504d88460..e402bc734 100644 --- a/torchrec/datasets/tests/test_criteo.py +++ b/torchrec/datasets/tests/test_criteo.py @@ -144,7 +144,7 @@ def test_tsv_to_npys(self) -> None: self.assertEqual(dense.shape, (num_rows, INT_FEATURE_COUNT)) self.assertEqual(dense.dtype, np.float32) self.assertEqual(sparse.shape, (num_rows, CAT_FEATURE_COUNT)) - self.assertEqual(sparse.dtype, np.int32) + self.assertEqual(sparse.dtype, np.int64) self.assertEqual(labels.shape, (num_rows, 1)) self.assertEqual(labels.dtype, np.int32) diff --git a/torchrec/distributed/batched_embedding_kernel.py b/torchrec/distributed/batched_embedding_kernel.py index 429bb7f67..4e9390451 100644 --- a/torchrec/distributed/batched_embedding_kernel.py +++ b/torchrec/distributed/batched_embedding_kernel.py @@ -674,13 +674,24 @@ def __init__( self.table_name_to_count: Dict[str, int] = {} self._param_per_table: Dict[str, TableBatchedEmbeddingSlice] = {} + # pyre-fixme[9]: config has type `GroupedEmbeddingConfig`; used as + # `ShardedEmbeddingTable`. for idx, config in enumerate(self._config.embedding_tables): + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute `local_rows`. self._local_rows.append(config.local_rows) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute + # `get_weight_init_min`. 
self._weight_init_mins.append(config.get_weight_init_min()) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute + # `get_weight_init_max`. self._weight_init_maxs.append(config.get_weight_init_max()) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute + # `num_embeddings`. self._num_embeddings.append(config.num_embeddings) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute `local_cols`. self._local_cols.append(config.local_cols) self._feature_table_map.extend([idx] * config.num_features()) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute `name`. if config.name not in self.table_name_to_count: self.table_name_to_count[config.name] = 0 self.table_name_to_count[config.name] += 1 @@ -1080,13 +1091,24 @@ def __init__( self.table_name_to_count: Dict[str, int] = {} self._param_per_table: Dict[str, TableBatchedEmbeddingSlice] = {} + # pyre-fixme[9]: config has type `GroupedEmbeddingConfig`; used as + # `ShardedEmbeddingTable`. for idx, config in enumerate(self._config.embedding_tables): + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute `local_rows`. self._local_rows.append(config.local_rows) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute + # `get_weight_init_min`. self._weight_init_mins.append(config.get_weight_init_min()) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute + # `get_weight_init_max`. self._weight_init_maxs.append(config.get_weight_init_max()) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute + # `num_embeddings`. self._num_embeddings.append(config.num_embeddings) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute `local_cols`. self._local_cols.append(config.local_cols) self._feature_table_map.extend([idx] * config.num_features()) + # pyre-fixme[16]: `GroupedEmbeddingConfig` has no attribute `name`. if config.name not in self.table_name_to_count: self.table_name_to_count[config.name] = 0 self.table_name_to_count[config.name] += 1 diff --git a/torchrec/distributed/benchmark/benchmark_utils.py b/torchrec/distributed/benchmark/benchmark_utils.py index a99b745c5..389786641 100644 --- a/torchrec/distributed/benchmark/benchmark_utils.py +++ b/torchrec/distributed/benchmark/benchmark_utils.py @@ -6,6 +6,7 @@ # LICENSE file in the root directory of this source tree. # pyre-strict +# pyre-ignore-all-errors[16] #!/usr/bin/env python3 @@ -431,6 +432,7 @@ def transform_module( compile_mode: CompileMode, world_size: int, batch_size: int, + # pyre-fixme[24]: Generic type `ContextManager` expects 1 type parameter. ctx: ContextManager, benchmark_unsharded_module: bool = False, ) -> torch.nn.Module: @@ -1051,7 +1053,6 @@ def benchmark_module( for compile_mode in compile_modes: if not benchmark_unsharded: # Test sharders should have a singular sharding_type - # pyre-ignore [16] sharder._sharding_type = sharding_type.value # pyre-ignore [6] benchmark_type = benchmark_type_name(compile_mode, sharding_type) diff --git a/torchrec/distributed/keyed_jagged_tensor_pool.py b/torchrec/distributed/keyed_jagged_tensor_pool.py index 9571929a5..d11457171 100644 --- a/torchrec/distributed/keyed_jagged_tensor_pool.py +++ b/torchrec/distributed/keyed_jagged_tensor_pool.py @@ -667,9 +667,9 @@ def _update_local( ) -> None: raise NotImplementedError("Inference does not support update") + # pyre-fixme[7]: Expected `KeyedJaggedTensor` but got implicit return value of + # `None`. 
def _update_preproc(self, values: KeyedJaggedTensor) -> KeyedJaggedTensor: - # pyre-fixme[7]: Expected `KeyedJaggedTensor` but got implicit return value - # of `None`. pass diff --git a/torchrec/distributed/object_pool.py b/torchrec/distributed/object_pool.py index ad2903d1f..3ff78bc82 100644 --- a/torchrec/distributed/object_pool.py +++ b/torchrec/distributed/object_pool.py @@ -131,16 +131,16 @@ def input_dist( *input, # pyre-ignore[2] **kwargs, + # pyre-fixme[7]: Expected `Awaitable[Awaitable[Tensor]]` but got implicit return + # value of `None`. ) -> Awaitable[Awaitable[torch.Tensor]]: - # pyre-fixme[7]: Expected `Awaitable[Awaitable[Tensor]]` but got implicit - # return value of `None`. pass + # pyre-fixme[7]: Expected `DistOut` but got implicit return value of `None`. def compute(self, ctx: ShrdCtx, dist_input: torch.Tensor) -> DistOut: - # pyre-fixme[7]: Expected `DistOut` but got implicit return value of `None`. pass + # pyre-fixme[7]: Expected `LazyAwaitable[Out]` but got implicit return value of + # `None`. def output_dist(self, ctx: ShrdCtx, output: DistOut) -> LazyAwaitable[Out]: - # pyre-fixme[7]: Expected `LazyAwaitable[Variable[Out]]` but got implicit - # return value of `None`. pass diff --git a/torchrec/distributed/planner/tests/test_partitioners.py b/torchrec/distributed/planner/tests/test_partitioners.py index 44a71e2da..8f46066da 100644 --- a/torchrec/distributed/planner/tests/test_partitioners.py +++ b/torchrec/distributed/planner/tests/test_partitioners.py @@ -773,6 +773,8 @@ def test_different_sharding_plan(self) -> None: for shard in sharding_option.shards: if shard.storage and shard.rank is not None: greedy_perf_hbm_uses[ + # pyre-fixme[6]: For 1st argument expected `SupportsIndex` + # but got `Optional[int]`. shard.rank ] += shard.storage.hbm # pyre-ignore[16] @@ -796,6 +798,8 @@ def test_different_sharding_plan(self) -> None: for sharding_option in sharding_options: for shard in sharding_option.shards: if shard.storage and shard.rank: + # pyre-fixme[6]: For 1st argument expected `SupportsIndex` but + # got `Optional[int]`. memory_balanced_hbm_uses[shard.rank] += shard.storage.hbm self.assertTrue(max(memory_balanced_hbm_uses) < max(greedy_perf_hbm_uses)) diff --git a/torchrec/distributed/shards_wrapper.py b/torchrec/distributed/shards_wrapper.py index 723b346b2..15f0f65be 100644 --- a/torchrec/distributed/shards_wrapper.py +++ b/torchrec/distributed/shards_wrapper.py @@ -27,8 +27,6 @@ aten = torch.ops.aten # pyre-ignore[5] -# pyre-fixme[13]: Attribute `_local_shards` is never initialized. -# pyre-fixme[13]: Attribute `_storage_meta` is never initialized. class LocalShardsWrapper(torch.Tensor): """ A wrapper class to hold local shards of a DTensor. @@ -37,7 +35,9 @@ class LocalShardsWrapper(torch.Tensor): """ __slots__ = ["_local_shards", "_storage_meta"] + # pyre-fixme[13]: Attribute `_local_shards` is never initialized. _local_shards: List[torch.Tensor] + # pyre-fixme[13]: Attribute `_storage_meta` is never initialized. _storage_meta: TensorStorageMetadata @staticmethod diff --git a/torchrec/distributed/tensor_pool.py b/torchrec/distributed/tensor_pool.py index 5496b9198..436851b9d 100644 --- a/torchrec/distributed/tensor_pool.py +++ b/torchrec/distributed/tensor_pool.py @@ -459,8 +459,8 @@ def _update_local( deduped_ids, dedup_permutation = deterministic_dedup(ids) shard.update(deduped_ids, values[dedup_permutation]) + # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`. 
def _update_preproc(self, values: torch.Tensor) -> torch.Tensor: - # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`. pass def update(self, ids: torch.Tensor, values: torch.Tensor) -> None: diff --git a/torchrec/distributed/tests/test_awaitable.py b/torchrec/distributed/tests/test_awaitable.py index c62782690..790a56b12 100644 --- a/torchrec/distributed/tests/test_awaitable.py +++ b/torchrec/distributed/tests/test_awaitable.py @@ -24,6 +24,8 @@ def _wait_impl(self) -> torch.Tensor: class AwaitableTests(unittest.TestCase): def test_callback(self) -> None: awaitable = AwaitableInstance() + # pyre-fixme[6]: For 1st argument expected `(Tensor) -> Tensor` but got + # `(ret: Any) -> int`. awaitable.callbacks.append(lambda ret: 2 * ret) self.assertTrue( torch.allclose(awaitable.wait(), torch.FloatTensor([2.0, 4.0, 6.0])) @@ -31,6 +33,8 @@ def test_callback(self) -> None: def test_callback_chained(self) -> None: awaitable = AwaitableInstance() + # pyre-fixme[6]: For 1st argument expected `(Tensor) -> Tensor` but got + # `(ret: Any) -> int`. awaitable.callbacks.append(lambda ret: 2 * ret) awaitable.callbacks.append(lambda ret: ret**2) self.assertTrue( diff --git a/torchrec/distributed/tests/test_embedding_types.py b/torchrec/distributed/tests/test_embedding_types.py index c2fb9b1f1..db9f660b7 100644 --- a/torchrec/distributed/tests/test_embedding_types.py +++ b/torchrec/distributed/tests/test_embedding_types.py @@ -29,9 +29,9 @@ def __init__(self) -> None: torch.nn.Module(), ] + # pyre-fixme[7]: Expected `EmbeddingBagCollectionContext` but got implicit + # return value of `None`. def create_context(self) -> ShrdCtx: - # pyre-fixme[7]: Expected `EmbeddingBagCollectionContext` but got implicit - # return value of `None`. pass def input_dist( @@ -41,19 +41,18 @@ def input_dist( *input, # pyre-ignore[2] **kwargs, - ) -> Awaitable[Awaitable[CompIn]]: # pyre-fixme[7]: Expected `Awaitable[Awaitable[KJTList]]` but got implicit # return value of `None`. + ) -> Awaitable[Awaitable[CompIn]]: pass + # pyre-fixme[7]: Expected `List[Tensor]` but got implicit return value of `None`. def compute(self, ctx: ShrdCtx, dist_input: CompIn) -> DistOut: - # pyre-fixme[7]: Expected `List[Tensor]` but got implicit return value of - # `None`. pass + # pyre-fixme[7]: Expected `LazyAwaitable[Dict[str, Tensor]]` but got implicit + # return value of `None`. def output_dist(self, ctx: ShrdCtx, output: DistOut) -> LazyAwaitable[Out]: - # pyre-fixme[7]: Expected `LazyAwaitable[Dict[str, Tensor]]` but got - # implicit return value of `None`. pass diff --git a/torchrec/distributed/tests/test_lazy_awaitable.py b/torchrec/distributed/tests/test_lazy_awaitable.py index bc97a3b6d..2237e4887 100644 --- a/torchrec/distributed/tests/test_lazy_awaitable.py +++ b/torchrec/distributed/tests/test_lazy_awaitable.py @@ -244,8 +244,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: tempFile = None with tempfile.NamedTemporaryFile(delete=False) as f: - # pyre-fixme[6]: For 2nd argument expected `SupportsWrite[bytes]` but - # got `_TemporaryFileWrapper[bytes]`. pickle.dump(gm, f) tempFile = f diff --git a/torchrec/distributed/train_pipeline/train_pipelines.py b/torchrec/distributed/train_pipeline/train_pipelines.py index 27092bf11..2886b23eb 100644 --- a/torchrec/distributed/train_pipeline/train_pipelines.py +++ b/torchrec/distributed/train_pipeline/train_pipelines.py @@ -1613,6 +1613,7 @@ def __init__( def get_compiled_autograd_ctx( self, + # pyre-fixme[24]: Generic type `ContextManager` expects 1 type parameter. 
) -> ContextManager: # this allows for pipelining # to avoid doing a sum on None diff --git a/torchrec/distributed/types.py b/torchrec/distributed/types.py index dd51099b5..b6bdec3bf 100644 --- a/torchrec/distributed/types.py +++ b/torchrec/distributed/types.py @@ -44,7 +44,6 @@ # other metaclasses (i.e. AwaitableMeta) for customized # behaviors, as Generic is non-trival metaclass in # python 3.6 and below - # pyre-fixme[21]: Could not find name `GenericMeta` in `typing` (stubbed). from typing import GenericMeta except ImportError: # In python 3.7+, GenericMeta doesn't exist as it's no @@ -975,6 +974,9 @@ def __init__( torch._C._log_api_usage_once(f"torchrec.distributed.{self.__class__.__name__}") self._qcomm_codecs_registry = qcomm_codecs_registry + # pyre-fixme[56]: Pyre doesn't yet support decorators with ParamSpec applied to + # generic functions. Consider using a context manager instead of a decorator, if + # possible. @abc.abstractclassmethod # pyre-ignore [3] def shard( diff --git a/torchrec/distributed/utils.py b/torchrec/distributed/utils.py index e32782f2c..81bc77308 100644 --- a/torchrec/distributed/utils.py +++ b/torchrec/distributed/utils.py @@ -476,7 +476,11 @@ def maybe_reset_parameters(m: nn.Module) -> None: def maybe_annotate_embedding_event( - event: EmbeddingEvent, module_fqn: Optional[str], sharding_type: Optional[str] + event: EmbeddingEvent, + module_fqn: Optional[str], + sharding_type: Optional[str], + # pyre-fixme[24]: Generic type `AbstractContextManager` expects 2 type parameters, + # received 1. ) -> AbstractContextManager[None]: if module_fqn and sharding_type: annotation = f"[{event.value}]_[{module_fqn}]_[{sharding_type}]" diff --git a/torchrec/inference/inference_legacy/__init__.py b/torchrec/inference/inference_legacy/__init__.py index 670f2af78..9546c65cc 100644 --- a/torchrec/inference/inference_legacy/__init__.py +++ b/torchrec/inference/inference_legacy/__init__.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-ignore-all-errors[0, 21] + """Torchrec Inference Torchrec inference provides a Torch.Deploy based library for GPU inference. diff --git a/torchrec/linter/module_linter.py b/torchrec/linter/module_linter.py index 669aa7a80..6ce79ed0c 100644 --- a/torchrec/linter/module_linter.py +++ b/torchrec/linter/module_linter.py @@ -37,7 +37,9 @@ def print_error_message( """ lint_item = { "path": python_path, + # pyre-fixme[16]: `AST` has no attribute `lineno`. "line": node.lineno, + # pyre-fixme[16]: `AST` has no attribute `col_offset`. "char": node.col_offset + 1, "severity": severity, "name": name, diff --git a/torchrec/metrics/tests/test_metric_module.py b/torchrec/metrics/tests/test_metric_module.py index eebd3fc39..c5968b463 100644 --- a/torchrec/metrics/tests/test_metric_module.py +++ b/torchrec/metrics/tests/test_metric_module.py @@ -528,6 +528,7 @@ def _test_adjust_compute_interval( ) mock_time.time = MagicMock(return_value=0.0) + # pyre-fixme[53]: Captured variable `batch` is not annotated. 
def _train(metric_module: RecMetricModule) -> float: for _ in range(metric_module.compute_interval_steps): metric_module.update(batch) diff --git a/torchrec/modules/utils.py b/torchrec/modules/utils.py index 75c6f2d82..0830f9800 100644 --- a/torchrec/modules/utils.py +++ b/torchrec/modules/utils.py @@ -40,6 +40,22 @@ def _fx_to_list(tensor: torch.Tensor) -> List[int]: return tensor.long().tolist() +@torch.fx.wrap +def _get_unflattened_lengths(lengths: torch.Tensor, num_features: int) -> torch.Tensor: + """ + Unflatten lengths tensor from [F * B] to [F, B]. + """ + return lengths.view(num_features, -1) + + +@torch.fx.wrap +def _slice_1d_tensor(tensor: torch.Tensor, start: int, end: int) -> torch.Tensor: + """ + Slice tensor. + """ + return tensor[start:end] + + def extract_module_or_tensor_callable( module_or_callable: Union[ Callable[[], torch.nn.Module], @@ -133,6 +149,8 @@ def convert_list_of_modules_to_modulelist( # `Iterable[torch.nn.Module]`. len(modules) == sizes[0] + # pyre-fixme[6]: For 1st argument expected `pyre_extensions.PyreReadOnly[Sized]` + # but got `Iterable[Module]`. ), f"the counts of modules ({len(modules)}) do not match with the required counts {sizes}" if len(sizes) == 1: return torch.nn.ModuleList(modules) @@ -290,20 +308,27 @@ def construct_jagged_tensors_inference( need_indices: bool = False, features_to_permute_indices: Optional[Dict[str, List[int]]] = None, reverse_indices: Optional[torch.Tensor] = None, + remove_padding: bool = False, ) -> Dict[str, JaggedTensor]: with record_function("## construct_jagged_tensors_inference ##"): + # [F * B] -> [F, B] + unflattened_lengths = _get_unflattened_lengths(lengths, len(embedding_names)) + if reverse_indices is not None: embeddings = torch.index_select( embeddings, 0, reverse_indices.to(torch.int32) ) + elif remove_padding: + embeddings = _slice_1d_tensor( + embeddings, 0, unflattened_lengths.sum().item() + ) ret: Dict[str, JaggedTensor] = {} - length_per_key: List[int] = _fx_to_list( - torch.sum(lengths.view(len(embedding_names), -1), dim=1) - ) - lengths = lengths.view(len(embedding_names), -1) - lengths_tuple = torch.unbind(lengths, dim=0) + length_per_key: List[int] = _fx_to_list(torch.sum(unflattened_lengths, dim=1)) + + lengths_tuple = torch.unbind(unflattened_lengths, dim=0) + embeddings_list = torch.split(embeddings, length_per_key, dim=0) values_list = torch.split(values, length_per_key) if need_indices else None diff --git a/torchrec/quant/embedding_modules.py b/torchrec/quant/embedding_modules.py index e8a2f10e8..0e28a1fe3 100644 --- a/torchrec/quant/embedding_modules.py +++ b/torchrec/quant/embedding_modules.py @@ -79,6 +79,10 @@ "__emb_name_to_num_rows_post_pruning" ) +MODULE_ATTR_REMOVE_STBE_PADDING_BOOL: str = "__remove_stbe_padding" + +MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT_BOOL: str = "__use_batching_hinted_output" + DEFAULT_ROW_ALIGNMENT = 16 @@ -894,7 +898,8 @@ def forward( if self.register_tbes else emb_module.forward(indices=indices, offsets=offsets) ) - lookup = _get_batching_hinted_output(lengths=lengths, output=lookup) + if getattr(self, MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT_BOOL, True): + lookup = _get_batching_hinted_output(lengths=lengths, output=lookup) embedding_names = self._embedding_names_by_batched_tables[key] jt = construct_jagged_tensors_inference( embeddings=lookup, @@ -902,6 +907,9 @@ def forward( values=indices, embedding_names=embedding_names, need_indices=self.need_indices(), + remove_padding=getattr( + self, MODULE_ATTR_REMOVE_STBE_PADDING_BOOL, False + ), ) for embedding_name in 
embedding_names:
                feature_embeddings[embedding_name] = jt[embedding_name]
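
The `torchrec/modules/utils.py` hunk above introduces two `torch.fx.wrap`-ed helpers and a `remove_padding` branch that trims trailing padding rows from the embedding output before `JaggedTensor`s are built. The sketch below is not part of the patch; it re-implements the two private helpers locally and uses made-up tensor shapes purely to illustrate what the new branch does.

```python
import torch

# Standalone mirrors of the helpers added to torchrec/modules/utils.py
# (re-declared here for illustration instead of importing the private functions).
def get_unflattened_lengths(lengths: torch.Tensor, num_features: int) -> torch.Tensor:
    # Reshape flattened lengths of shape [F * B] into [F, B].
    return lengths.view(num_features, -1)

def slice_1d_tensor(tensor: torch.Tensor, start: int, end: int) -> torch.Tensor:
    # Keep rows [start, end) of a row-major tensor.
    return tensor[start:end]

# Hypothetical shapes: two features (F = 2), batch size three (B = 3).
lengths = torch.tensor([1, 2, 0, 3, 1, 1])           # flattened [F * B]
unflattened = get_unflattened_lengths(lengths, 2)    # shape [2, 3]

# Pretend the kernel returned 12 padded rows while only lengths.sum() == 8 are real;
# the remove_padding branch slices off the tail before constructing JaggedTensors.
embeddings = torch.randn(12, 4)
trimmed = slice_1d_tensor(embeddings, 0, int(unflattened.sum().item()))
assert trimmed.shape == (8, 4)
```

Wrapping the helpers with `@torch.fx.wrap` presumably keeps the reshape and the data-dependent slice as opaque call nodes under FX tracing, which matters for the inference graphs this code path serves; the patch itself only shows the decorator, so treat that rationale as an inference rather than a documented guarantee.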