Commit 055a673e authored by rprenger's avatar rprenger

Addressing comments

parent 279d8320
#!/bin/bash
echo "Loading model and starting server. May take several minutes"
./run_api_server_530B.sh &   # launch the server in the background so the poll loop below can run
STATUS=1
while [ $STATUS -eq 1 ]
do
    sleep 20
    # Probe the endpoint; -i includes the response headers so grep can see the
    # status line. grep exits 0 (ending the loop) on any HTTP 2xx/3xx reply.
    curl -s -i -m 20 'http://localhost:5000/generate' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"sentences":["Test2"], "max_len":30}' | head -n 1 | grep "HTTP/1.[01] [23].." > /dev/null
    STATUS=$?
done
python tools/run_cli.py 'http://localhost:5000/generate'
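
Once the loop exits, the same endpoint can also be driven directly from Python. A minimal client sketch mirroring the curl probe above; only the PUT /generate contract and the JSON field names come from the script, while the requests dependency, the helper name, and the JSON response handling are assumptions:

import requests

def query_server(sentences, max_len=30, url="http://localhost:5000/generate"):
    # Mirrors the curl probe: PUT a JSON body with "sentences" and "max_len".
    resp = requests.put(url, json={"sentences": sentences, "max_len": max_len}, timeout=20)
    resp.raise_for_status()
    # Assumes a JSON response; switch to resp.text if the server returns plain text.
    return resp.json()

print(query_server(["Test2"]))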
 #!/bin/bash
-DISTRIBUTED_ARGS="--nproc_per_node 16 \
-                  --nnodes 3 \
+# This example will start serving the 345M model.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
                   --node_rank 0 \
                   --master_addr localhost \
                   --master_port 6000"
-CHECKPOINT=<Path to checkpoint (e.g /gpt3-530b-megatron_tp16_pp3)>
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
 VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
 MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
 pip install flask-restful
-python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_api_server.py \
-       --tensor-model-parallel-size 16 \
-       --pipeline-model-parallel-size 3 \
-       --num-layers 105 \
-       --hidden-size 20480 \
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
        --load ${CHECKPOINT} \
-       --num-attention-heads 128 \
-       --max-position-embeddings 2048 \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
        --tokenizer-type GPT2BPETokenizer \
        --fp16 \
        --micro-batch-size 1 \
-       --seq-length 2048 \
-       --out-seq-length 2048 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
        --temperature 1.0 \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
......
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
       --tensor-model-parallel-size 8 \
       --pipeline-model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load ${CHECKPOINT} \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --top_p 0.9 \
--seed 42
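
All three launch configurations in these examples (single GPU, 8-way tensor parallel, and the original 16-way tensor / 3-way pipeline 530B setup) obey the same constraint: with no data parallelism during serving, the world size created by torch.distributed.launch must equal the product of the two model-parallel sizes. A small illustrative checker; the helper is ours, not part of the repo:

def check_parallel_config(nproc_per_node, nnodes, tp, pp):
    # For these serving examples there is no data parallelism, so the
    # world size must exactly cover the model-parallel grid.
    world_size = nproc_per_node * nnodes
    assert world_size == tp * pp, \
        f"world size {world_size} != {tp} (tensor) * {pp} (pipeline)"

check_parallel_config(1, 1, tp=1, pp=1)     # 345M, single GPU
check_parallel_config(8, 1, tp=8, pp=1)     # 345M, 8-way tensor parallel
check_parallel_config(16, 3, tp=16, pp=3)   # 530B: 16-way tensor, 3-way pipeline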
......
@@ -121,14 +121,14 @@ def receive_generate_info():
     """
     Needs to be synced up with send_generate_info
     """
-    input_info_tensor = torch.empty(3, dtype=torch.int64, device=torch.device("cuda"))
+    input_info_tensor = torch.empty(3, dtype=torch.int64, device=torch.cuda.current_device())
     torch.distributed.broadcast(input_info_tensor, 0)
     batch_size = input_info_tensor[0].item()
     seq_len = input_info_tensor[1].item()
     max_len = input_info_tensor[2].item()
-    context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.device("cuda"))
-    context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.device("cuda"))
+    context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.cuda.current_device())
+    context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.cuda.current_device())
     # Send variables to all ranks
     torch.distributed.broadcast(context_length_tensor, 0)
......
@@ -153,9 +153,6 @@ def synced_generate(model, context_tokens_tensor, context_length_tensor, max_len
 def generate(model, sentences=None, max_len=0):
     if torch.distributed.get_rank() == 0:
         context_tokens_tensor, context_length_tensor = tokenize_batch(sentences)
-        c = context_length_tensor[0]
-        b = context_tokens_tensor.size(0)
-        start = time.time()
         send_generate_info(context_tokens_tensor, context_length_tensor, max_len)
     else:
         context_length_tensor, context_tokens_tensor, max_len = receive_generate_info()
......
@@ -169,8 +166,6 @@ def generate(model, sentences=None, max_len=0):
         for i in range(decode_tokens.size(0)):
             decode_token = decode_tokens[i,:].cpu().numpy().tolist()
             resp_sentences.append(tokenizer.detokenize(decode_token))
-        end = time.time()
-        print(str(b)+","+str(c)+","+str(decode_tokens.size(1))+","+str(end-start), flush=True)
         return resp_sentences

 def switch(val1, val2, boolean):
......
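
The two hunks above strip ad-hoc timing instrumentation from generate(); what remains is the handshake in which rank 0 broadcasts tensor sizes before the tensors themselves, so every other rank can allocate matching buffers inside receive_generate_info(). A simplified sketch of the sending side implied by that receiver; the real send_generate_info() lives in megatron/text_generation_utils.py and may differ in detail:

import torch

def send_generate_info(context_tokens_tensor, context_length_tensor, max_len):
    # Sizes go first so receivers can torch.empty() matching buffers.
    input_info = torch.tensor(
        [context_tokens_tensor.size(0),   # batch_size
         context_tokens_tensor.size(1),   # seq_len
         max_len],
        dtype=torch.int64, device=torch.cuda.current_device())
    torch.distributed.broadcast(input_info, 0)
    # Then the payloads, in the same order receive_generate_info reads them.
    torch.distributed.broadcast(context_length_tensor, 0)
    torch.distributed.broadcast(context_tokens_tensor, 0)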
......
@@ -26,7 +26,7 @@ from megatron.checkpointing import load_checkpoint
 from megatron.initialize import initialize_megatron
 from megatron.model import GPTModel
 from megatron.training import get_model
-from megatron.api_server import MegatronServer
+from megatron.text_generation_server import MegatronServer
 from megatron.text_generation_utils import generate
 import torch
......
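
Given these imports, the tool presumably has rank 0 expose the HTTP endpoint while every other rank blocks in generate(), joining the broadcasts shown earlier. A speculative sketch only; MegatronServer's constructor and run() signature are assumptions not confirmed by this diff:

import torch
from megatron.text_generation_server import MegatronServer
from megatron.text_generation_utils import generate

def run_server(model):
    if torch.distributed.get_rank() == 0:
        # Rank 0 owns the HTTP endpoint (flask-restful, per the pip install
        # in the launch scripts). Constructor and run() are assumed here.
        MegatronServer(model).run("0.0.0.0")
    else:
        # Every other rank loops forever, joining rank 0's broadcasts.
        while True:
            generate(model)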