[Doc]: fix typos in Python comments (#24417)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
@@ -302,7 +302,7 @@ class FusedMoEPrepareAndFinalize(ABC):
     def max_num_tokens_per_rank(self) -> Optional[int]:
         """
         Some PrepareFinalize All2All implementations are batched. Meaning,
-        they can processes only as set of tokens at a time. This
+        they can process only a set of tokens at a time. This
         function returns the batch size, i.e. the maximum number of tokens
         the implementation can process at a time.
         Return None if there are no such restrictions.
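For context, the docstring above describes batched PrepareFinalize implementations that can only accept a fixed number of tokens per call, with None meaning no limit. Below is a minimal, hypothetical sketch (not vLLM's actual dispatch code) of how a caller might honor that contract; dispatch_in_batches and process_chunk are made-up names for illustration, and only max_num_tokens_per_rank() comes from the diff.

import torch

def dispatch_in_batches(prepare_finalize, hidden_states, process_chunk):
    # Hypothetical helper (illustration only): respect the per-call batch
    # limit reported by a batched PrepareAndFinalize implementation.
    max_tokens = prepare_finalize.max_num_tokens_per_rank()
    if max_tokens is None:
        # No restriction: the implementation accepts any number of tokens.
        return process_chunk(hidden_states)
    # Otherwise, feed the tokens through in chunks of at most max_tokens.
    chunks = [
        process_chunk(hidden_states[i:i + max_tokens])
        for i in range(0, hidden_states.shape[0], max_tokens)
    ]
    return torch.cat(chunks, dim=0)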
@@ -201,7 +201,7 @@ def marlin_make_workspace(output_size_per_partition: int,
 def marlin_make_workspace_new(device: torch.device,
                               max_blocks_per_sm: int = 1) -> torch.Tensor:
     # In the new marlin kernel, we use the num of threadblocks as workspace
-    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    # size. The num of threadblocks is sms_count * max_blocks_per_sm.
     sms = torch.cuda.get_device_properties(device).multi_processor_count
     return torch.zeros(sms * max_blocks_per_sm,
                        dtype=torch.int,
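The corrected comment states the sizing rule directly: the workspace holds one int slot per resident threadblock, i.e. sms_count * max_blocks_per_sm entries. A small illustrative check of that arithmetic follows; the SM count below is an example value rather than one queried from hardware.

import torch

# Illustrative numbers only; on a real device, sms_count would come from
# torch.cuda.get_device_properties(device).multi_processor_count.
sms_count = 108           # e.g. an A100 reports 108 SMs
max_blocks_per_sm = 1
workspace = torch.zeros(sms_count * max_blocks_per_sm, dtype=torch.int)
assert workspace.numel() == 108  # one int slot per threadblock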