Unverified commit 71321a07, authored by digger yu, committed by GitHub
Browse files

fix typo change dosen't to doesn't (#5308)

parent 6a3086a5
...@@ -49,7 +49,7 @@ class FalconPolicy(Policy): ...@@ -49,7 +49,7 @@ class FalconPolicy(Policy):
if not self.model.config.new_decoder_architecture and self.model.config.multi_query: if not self.model.config.new_decoder_architecture and self.model.config.multi_query:
warnings.warn( warnings.warn(
"Falcon dosen't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag." "Falcon doesn't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag."
) )
self.shard_config.enable_tensor_parallelism = False self.shard_config.enable_tensor_parallelism = False
......
...@@ -46,7 +46,7 @@ class LlamaPolicy(Policy): ...@@ -46,7 +46,7 @@ class LlamaPolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
warnings.warn("Llama dosen't support sequence parallelism now, will ignore the sequence parallelism flag.") warnings.warn("Llama doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
decoder_attribute_replacement = { decoder_attribute_replacement = {
......
...@@ -35,7 +35,7 @@ class MistralPolicy(Policy): ...@@ -35,7 +35,7 @@ class MistralPolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
warnings.warn( warnings.warn(
"Mistral dosen't support sequence parallelism now, will ignore the sequence parallelism flag." "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
) )
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
...@@ -136,7 +136,7 @@ class MistralModelPolicy(MistralPolicy): ...@@ -136,7 +136,7 @@ class MistralModelPolicy(MistralPolicy):
def module_policy(self): def module_policy(self):
if self.pipeline_stage_manager: if self.pipeline_stage_manager:
warnings.warn("Mistral dosen't support pipeline parallelism now.") warnings.warn("Mistral doesn't support pipeline parallelism now.")
return super().module_policy() return super().module_policy()
...@@ -160,7 +160,7 @@ class MistralForCausalLMPolicy(MistralPolicy): ...@@ -160,7 +160,7 @@ class MistralForCausalLMPolicy(MistralPolicy):
} }
if self.pipeline_stage_manager: if self.pipeline_stage_manager:
warnings.warn("Mistral dosen't support pipeline parallelism now.") warnings.warn("Mistral doesn't support pipeline parallelism now.")
policy.update(new_item) policy.update(new_item)
...@@ -186,7 +186,7 @@ class MistralForSequenceClassificationPolicy(MistralPolicy): ...@@ -186,7 +186,7 @@ class MistralForSequenceClassificationPolicy(MistralPolicy):
} }
if self.pipeline_stage_manager: if self.pipeline_stage_manager:
warnings.warn("Mistral dosen't support pipeline parallelism now.") warnings.warn("Mistral doesn't support pipeline parallelism now.")
policy.update(new_item) policy.update(new_item)
return policy return policy
...@@ -59,7 +59,7 @@ class OPTPolicy(Policy): ...@@ -59,7 +59,7 @@ class OPTPolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
warnings.warn("OPT dosen't support sequence parallelism now, will ignore the sequence parallelism flag.") warnings.warn("OPT doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
policy[OPTDecoder] = ModulePolicyDescription( policy[OPTDecoder] = ModulePolicyDescription(
......
...@@ -66,7 +66,7 @@ class T5BasePolicy(Policy): ...@@ -66,7 +66,7 @@ class T5BasePolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
warnings.warn("T5 dosen't support sequence parallelism now, will ignore the sequence parallelism flag.") warnings.warn("T5 doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
policy[T5Stack] = ModulePolicyDescription( policy[T5Stack] = ModulePolicyDescription(
...@@ -263,7 +263,7 @@ class T5BasePolicy(Policy): ...@@ -263,7 +263,7 @@ class T5BasePolicy(Policy):
if num_decoder_layers == 0: if num_decoder_layers == 0:
return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages
# the number of stages distributed between encoder and decoder is optmized in this way: # the number of stages distributed between encoder and decoder is optimized in this way:
# num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages)) # num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages))
# s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1 # s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1
def objective(num_encoder_stages): def objective(num_encoder_stages):
......
...@@ -33,7 +33,7 @@ class ViTPolicy(Policy): ...@@ -33,7 +33,7 @@ class ViTPolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
warnings.warn("Vit dosen't support sequence parallelism now, will ignore the sequence parallelism flag.") warnings.warn("Vit doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
policy[ViTEmbeddings] = ModulePolicyDescription( policy[ViTEmbeddings] = ModulePolicyDescription(
......
...@@ -69,13 +69,13 @@ class WhisperPolicy(Policy): ...@@ -69,13 +69,13 @@ class WhisperPolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
warnings.warn( warnings.warn(
"Whisper dosen't support sequence parallelism now, will ignore the sequence parallelism flag." "Whisper doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
) )
# TODO using the jit fused add_and_dropout affect the accuracy # TODO using the jit fused add_and_dropout affect the accuracy
if self.shard_config.enable_jit_fused: if self.shard_config.enable_jit_fused:
self.shard_config.enable_jit_fused = False self.shard_config.enable_jit_fused = False
warnings.warn("Whisper dosen't support jit fused operator now, will ignore the jit fused operator flag.") warnings.warn("Whisper doesn't support jit fused operator now, will ignore the jit fused operator flag.")
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
policy[WhisperEncoderLayer] = ModulePolicyDescription( policy[WhisperEncoderLayer] = ModulePolicyDescription(
...@@ -302,7 +302,7 @@ class WhisperPolicy(Policy): ...@@ -302,7 +302,7 @@ class WhisperPolicy(Policy):
if num_decoder_layers == 0: if num_decoder_layers == 0:
return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages
# the number of stages distributed between encoder and decoder is optmized in this way: # the number of stages distributed between encoder and decoder is optimized in this way:
# num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages)) # num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages))
# s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1 # s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1
def objective(num_encoder_stages): def objective(num_encoder_stages):
......
...@@ -43,7 +43,7 @@ class OpenMoePolicy(Policy): ...@@ -43,7 +43,7 @@ class OpenMoePolicy(Policy):
if self.shard_config.enable_sequence_parallelism: if self.shard_config.enable_sequence_parallelism:
self.shard_config.enable_sequence_parallelism = False self.shard_config.enable_sequence_parallelism = False
raise NotImplementedError( raise NotImplementedError(
"openmoe dosen't support sequence parallelism now, will ignore the sequence parallelism flag.") "openmoe doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
if self.shard_config.enable_tensor_parallelism: if self.shard_config.enable_tensor_parallelism:
raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.") raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment