Commit 7d224eb2 authored by lizhigong's avatar lizhigong
Browse files

Remove debug print statements and comment out the CUDA_VISIBLE_DEVICES/HIP_VISIBLE_DEVICES override

parent 0ecda6d1
......@@ -48,13 +48,13 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
f"is less than than max local gpu count ({cuda_device_count})")
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
if "CUDA_VISIBLE_DEVICES" or "HIP_VISIBLE_DEVICES" not in os.environ:
update_environment_variables({
"CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
})
update_environment_variables({
"HIP_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
})
# if "CUDA_VISIBLE_DEVICES" or "HIP_VISIBLE_DEVICES" not in os.environ:
# update_environment_variables({
# "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
# })
# update_environment_variables({
# "HIP_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
# })
def _init_executor(self) -> None:
......
......@@ -746,7 +746,6 @@ def _sample_with_torch(
else:
raise ValueError(f"Unsupported sampling type: {sampling_type}")
print('###sampled_token_ids', sampled_token_ids_)
# Encapsulate arguments for computing Pythonized sampler
# results, whether deferred or otherwise.
maybe_deferred_args = SampleResultArgsType(
......
......@@ -910,7 +910,6 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
accepted_token_ids, target_logprobs, select_indices_list, accept_lengths = self._verify_tokens(
execute_model_req.seq_group_metadata_list, proposal_scores,
proposals, execute_model_req.num_lookahead_slots)
print('###accepted_token_ids', accepted_token_ids)
# move kv_caches of selected tokens to right positions
if self.tree_decoding:
......@@ -1341,7 +1340,6 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
self._maybe_log_stage_times(*stage_times)
# First `n_prefills` entries will contain prefills SamplerOutput when
# chunked prefill is enabled, the rest is decodes in multi-step format.
print('###sampler_output_list', sampler_output_list)
return sampler_output_list
def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float,
......
......@@ -902,7 +902,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Tokens and positions.
if cuda_graph_pad_size:
input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size))
print('###input_tokens', input_tokens)
assert self.runner.device is not None
input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long,
self.runner.device,
......@@ -917,14 +916,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for idx in range(3):
mrope_input_positions[idx].extend(
itertools.repeat(0, cuda_graph_pad_size))
print('###mrope_input_positions', mrope_input_positions)
input_positions_tensor = async_tensor_h2d(mrope_input_positions,
torch.long,
self.runner.device,
self.runner.pin_memory)
else:
input_positions.extend(itertools.repeat(0, cuda_graph_pad_size))
print('###input_positions', input_positions)
input_positions_tensor = async_tensor_h2d(input_positions,
torch.long,
self.runner.device,
......@@ -932,7 +929,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Sequence and query lengths.
if cuda_graph_pad_size:
seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size))
print('###seq_lens', seq_lens)
# Attention metadata.
attn_metadata = self.attn_metadata_builder.build(
......@@ -1006,7 +1002,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prompt_adapter_mapping=prompt_adapter_mapping,
prompt_adapter_requests=prompt_adapter_requests)
print('###model_input', ret)
return ret
......
......@@ -92,7 +92,6 @@ class ZeroOverheadModelInputForGpuBuilder(ModelInputForGPUBuilder):
def build(self) -> ModelInputForGPU:
model_input = super().build()
print('###model_input', model_input)
last_sampler = get_last_sampler()
spec_step = get_spec_step()
last_step = get_spec_last_step()
......@@ -167,5 +166,4 @@ class ZeroOverheadModelInputForGpuBuilder(ModelInputForGPUBuilder):
)
print('###zero_model_input', model_input)
return model_input
......@@ -359,7 +359,6 @@ def _sample_with_torch(
sampled_token_ids_tensor[long_sample_indices] = \
multinomial_samples[sampling_type].to(torch.long)
print('###sampled_token_ids', last_sampler.sampled_token_ids_tensor)
# Encapsulate arguments for computing Pythonized sampler
# results, whether deferred or otherwise.
maybe_deferred_args = SampleResultArgsType(
......
......@@ -545,7 +545,6 @@ class ZeroOverheadSpecDecodeWorker(SpecDecodeWorker):
self._maybe_log_stage_times(*stage_times)
# First `n_prefills` entries will contain prefills SamplerOutput when
# chunked prefill is enabled, the rest is decodes in multi-step format.
print('###sampler_output_list', sampler_output_list)
return sampler_output_list
def _track_sequences_with_bonus_tokens(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment