srt_example_llava_v.sh 3.9 KB
Newer Older
Yuanhan Zhang's avatar
Yuanhan Zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash

##### USAGE #####
#    - First node:
#      ```sh
#      bash examples/quick_start/srt_example_llava_v.sh K 0 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```
#    - Second node:
#      ```sh
#      bash examples/quick_start/srt_example_llava_v.sh K 1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```
#    - The K-th (last) node:
#      ```sh
#      bash examples/quick_start/srt_example_llava_v.sh K K-1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```


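# Replace `K` (the total number of nodes), `YOUR_VIDEO_PATH`, `YOUR_MODEL_PATH`, and `FRAMES_PER_VIDEO` with your specific details.
# The second argument is the zero-based index of the current node (0 for the first node, K-1 for the last).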
# Resolve the directory that contains this script and make it the working
# directory, so the relative paths used later are resolved against it.
CURRENT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

printf '%s\n' "${CURRENT_ROOT}"

# Quoted so paths containing spaces work; abort if the directory cannot be
# entered rather than continuing from an unexpected location.
cd "${CURRENT_ROOT}" || exit 1

# Exported so that child processes started by this script inherit it.
export PYTHONWARNINGS=ignore

START_TIME=$(date +%s)  # Capture start time (seconds since the epoch)

# Positional arguments, in the order shown in the USAGE header.
# Missing arguments are only reported, not rejected, so existing callers
# keep working exactly as before.
if [ "$#" -lt 5 ]; then
    echo "Warning: expected 5 arguments (NUM_NODES CUR_NODES_IDX VIDEO_DIR MODEL_PATH NUM_FRAMES), got $#." >&2
fi

NUM_NODES=$1       # total number of nodes taking part in the run

CUR_NODES_IDX=$2   # zero-based index of this node (0 .. NUM_NODES-1)

VIDEO_DIR=$3       # directory containing the input videos

MODEL_PATH=$4      # model path handed to the inference script

NUM_FRAMES=$5      # number of frames to sample from each video


# FRAME_FORMAT=$6

# FRAME_FORMAT=$(echo $FRAME_FORMAT | tr '[:lower:]' '[:upper:]')

# # Check if FRAME_FORMAT is either JPEG or PNG
# if [[ "$FRAME_FORMAT" != "JPEG" && "$FRAME_FORMAT" != "PNG" ]]; then
#     echo "Error: FRAME_FORMAT must be either JPEG or PNG."
#     exit 1
# fi

# export TARGET_FRAMES=$TARGET_FRAMES

echo "Each video you will sample $NUM_FRAMES frames"

# export FRAME_FORMAT=$FRAME_FORMAT

# echo "The frame format is $FRAME_FORMAT"

# GPULIST is a bash array holding the GPU ids available on this node.
GPULIST=(0 1 2 3 4 5 6 7)
NUM_GPUS=${#GPULIST[@]}

echo "Number of GPUs in GPULIST: $NUM_GPUS"

# Number of GPUs handed to each chunk (one inference process per chunk).
GPUS_PER_CHUNK=8

# A chunk needs at least one GPU and cannot use more than are listed.
if (( GPUS_PER_CHUNK < 1 || GPUS_PER_CHUNK > NUM_GPUS )); then
    echo "Error: GPUS_PER_CHUNK ($GPUS_PER_CHUNK) must be between 1 and $NUM_GPUS." >&2
    exit 1
fi

# Chunks run on this node. This must be derived from GPUS_PER_CHUNK: using the
# raw GPU count here would schedule chunks whose GPU slice starts past the end
# of GPULIST (i.e. an empty device list) and would inflate ALL_CHUNKS.
# Integer division: GPUs left over after the last full chunk stay unused.
LOCAL_CHUNKS=$((NUM_GPUS / GPUS_PER_CHUNK))

# Total chunks across all nodes; every node is expected to use the same
# GPULIST / GPUS_PER_CHUNK settings.
ALL_CHUNKS=$((NUM_NODES * LOCAL_CHUNKS))

echo $GPUS_PER_CHUNK

#######################################
# Runs the inference script for one chunk, retrying on failure.
# Globals:   ALL_CHUNKS, VIDEO_DIR, MODEL_PATH, NUM_FRAMES (read)
# Arguments: $1 - global zero-based chunk index
#            $2 - comma-separated GPU ids, passed as CUDA_VISIBLE_DEVICES
# Outputs:   progress messages on stdout
# Returns:   0 as soon as one attempt succeeds; otherwise the exit status of
#            the last failed attempt.
#######################################
run_chunk() {
    local chunk_idx=$1
    local gpus=$2
    local max_retries=10
    local retry_delay=180   # seconds to wait between attempts
    local attempt=0
    local status=1          # initialise as failed so the loop is entered
    local port

    # One random port per chunk, reused for every retry of that chunk.
    # RANDOM never exceeds 32767, so the port falls in 10000-42767.
    port=$((10000 + RANDOM % 55536))

    while [ "$attempt" -lt "$max_retries" ] && [ "$status" -ne 0 ]; do
        echo "Running chunk $chunk_idx on GPUs $gpus with port $port. Attempt $((attempt + 1))"

        CUDA_VISIBLE_DEVICES=$gpus python3 examples/usage/llava_video/srt_example_llava_v.py \
            --port "$port" \
            --num-chunks "$ALL_CHUNKS" \
            --chunk-idx "$chunk_idx" \
            --save-dir work_dirs/llava_next_video_inference_results \
            --video-dir "$VIDEO_DIR" \
            --model-path "$MODEL_PATH" \
            --num-frames "$NUM_FRAMES"
        # The command runs in the foreground, so its status must be read
        # directly. A `wait $!` here would have no background job to wait for
        # and would report success regardless of the real outcome.
        status=$?

        if [ "$status" -ne 0 ]; then
            attempt=$((attempt + 1))
            if [ "$attempt" -lt "$max_retries" ]; then
                echo "Execution failed for chunk $chunk_idx, attempt $attempt. Retrying..."
                sleep "$retry_delay"  # Wait a bit before retrying
            else
                echo "Execution failed for chunk $chunk_idx, attempt $attempt."
            fi
        else
            echo "Execution succeeded for chunk $chunk_idx."
        fi
    done

    if [ "$status" -ne 0 ]; then
        echo "Execution failed for chunk $chunk_idx after $max_retries attempts."
    fi

    return "$status"
}

# Launch every chunk assigned to this node. Each chunk runs in a subshell so
# its variables stay isolated; chunks currently run one after another (append
# `&` after the closing parenthesis to run them in parallel).
for ((IDX = 1; IDX <= LOCAL_CHUNKS; IDX++)); do
    (
        START=$(((IDX - 1) * GPUS_PER_CHUNK))
        LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index

        CHUNK_GPUS=("${GPULIST[@]:$START:$LENGTH}")

        # Convert the chunk GPUs array to a comma-separated string
        CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")

        # Global zero-based index of this chunk across all nodes.
        CHUNK_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX - 1))

        # An empty slice means LOCAL_CHUNKS / GPUS_PER_CHUNK do not fit
        # GPULIST; do not launch a job with an empty device list.
        if [ "${#CHUNK_GPUS[@]}" -eq 0 ]; then
            echo "Error: no GPUs left in GPULIST for chunk $CHUNK_IDX; check GPUS_PER_CHUNK and LOCAL_CHUNKS." >&2
            exit 1
        fi

        echo "Chunk $CHUNK_IDX will run on GPUs $CHUNK_GPUS_STR"

        run_chunk "$CHUNK_IDX" "$CHUNK_GPUS_STR"
    ) #&
    sleep 2  # Slight delay to stagger the start times
done

# Barrier for background chunks (returns immediately when there are none).
wait

#######################################
# Concatenates the per-chunk result CSVs of a directory into one per-node CSV.
# Files are concatenated in glob (lexicographic) order.
# Arguments: $1 - directory containing final_results_chunk_*.csv
#            $2 - node index used in the output file name
# Outputs:   writes $1/final_results_node_$2.csv; warning on stderr if there
#            is nothing to merge
# Returns:   0 on success, 1 if no chunk files exist, otherwise cat's status
#######################################
merge_chunk_results() {
    local results_dir=$1
    local node_idx=$2
    local chunk_files=("$results_dir"/final_results_chunk_*.csv)

    # An unmatched glob stays literal, so test the first entry for existence
    # instead of letting cat fail and leave an empty node file behind.
    if [ ! -e "${chunk_files[0]}" ]; then
        echo "Warning: no chunk result files found in $results_dir; nothing to merge." >&2
        return 1
    fi

    cat -- "${chunk_files[@]}" > "$results_dir/final_results_node_${node_idx}.csv"
}

merge_chunk_results work_dirs/llava_next_video_inference_results "$CUR_NODES_IDX"

END_TIME=$(date +%s)  # Capture end time
ELAPSED_TIME=$((END_TIME - START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds."