# 2d_hist.py 4.38 KB
# Newer Older
# mashun1's avatar
# mashun1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import json
import os
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import functools
import argparse


def load_data(json_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path to the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Pin the encoding so non-ASCII text decodes the same way regardless of
    # the platform's locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


def filter_data(data):
    """Return the entries of ``data`` that contain an ``"image"`` key.

    The input order is preserved and ``data`` itself is not modified.
    """
    kept = []
    for entry in data:
        if "image" in entry:
            kept.append(entry)
    return kept


def calculate_image_dimension(image_path, images_folder):
    """Read the size of a single image.

    Args:
        image_path: Image path, joined onto ``images_folder``.
        images_folder: Folder that ``image_path`` is resolved against.

    Returns:
        ``(width, height)`` as reported by the opened image's ``size``, or
        ``(None, None)`` if anything goes wrong while opening or reading it
        (the error is printed rather than raised).
    """
    target = os.path.join(images_folder, image_path)
    try:
        with Image.open(target) as picture:
            width, height = picture.size
    except Exception as err:
        # Best effort: report the failure and let the caller skip this image.
        print(f"Error opening {target}: {err}")
        return None, None
    return width, height


def calculate_image_dimensions_multiprocess(filtered_data, images_folder, num_processes=256):
    """Measure every image referenced by ``filtered_data`` using a process pool.

    Args:
        filtered_data: Items that each have an ``"image"`` entry holding either
            a single path or a list of paths.
        images_folder: Folder the image paths are resolved against.
        num_processes: Upper bound on the number of worker processes. The pool
            is never made larger than the number of images to measure.

    Returns:
        A ``(widths, heights)`` pair of lists covering the images that could be
        opened. Both lists are empty when there are no images or none of them
        could be read.
    """
    image_paths = []
    for item in filtered_data:
        images = item["image"]
        if isinstance(images, list):
            image_paths.extend(images)
        else:
            image_paths.append(images)

    # Nothing to measure: avoid spawning a pool and unpacking an empty zip.
    if not image_paths:
        return [], []

    # Do not start more workers than there are images to open.
    pool_size = num_processes if num_processes is None else min(num_processes, len(image_paths))
    worker = functools.partial(calculate_image_dimension, images_folder=images_folder)

    with Pool(pool_size) as p:
        dimensions = list(
            tqdm(
                p.imap(worker, image_paths),
                total=len(image_paths),
                desc="Calculating image dimensions",
            )
        )

    # Unreadable images come back as (None, None) and are dropped here.
    valid = [dim for dim in dimensions if dim[0] is not None]
    if not valid:
        return [], []

    widths, heights = zip(*valid)
    return list(widths), list(heights)


def tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting pieces."""
    tokens = text.split()
    return tokens


def calculate_tokenized_lengths(data):
    """Return the token count of every conversation turn in ``data``.

    Each item's ``"conversations"`` list is walked in order and the
    ``"value"`` of every turn is tokenized with ``tokenize``; the result is a
    flat list with one length per turn.
    """
    return [
        len(tokenize(turn["value"]))
        for item in tqdm(data, desc="Tokenizing conversations")
        for turn in item["conversations"]
    ]


def main():
    """Plot image-size and conversation-length distributions for a dataset.

    Reads ``--json_path`` and ``--images_folder`` from the command line, keeps
    the items that reference an image, and saves a two-panel figure to
    ``./dist_<json name>_combined.png``: a 2D histogram of image width vs.
    height and a log-scale histogram of tokenized conversation lengths.

    Raises:
        SystemExit: If the collected width list or the collected token-length
            list is empty, since there would be nothing to plot.
    """
    parser = argparse.ArgumentParser(description="Process data for LLaVA_Next project.")
    parser.add_argument(
        "--json_path",
        type=str,
        help="Path to the JSON file containing data.",
        default="/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_DEMON-FULL.json",
    )
    parser.add_argument(
        "--images_folder",
        type=str,
        default="/mnt/bn/vl-research/data/llava_data",
        help="Path to the folder containing images.",
    )
    args = parser.parse_args()

    # Strip only a trailing ".json"; str.replace would also remove the
    # substring from the middle of the file name.
    llava_instruct_name = os.path.basename(args.json_path)
    if llava_instruct_name.endswith(".json"):
        llava_instruct_name = llava_instruct_name[: -len(".json")]
    images_folder = args.images_folder

    data = load_data(args.json_path)
    filtered_data = filter_data(data)

    print(f"Total data items: {len(data)}, Filtered data items: {len(filtered_data)}")
    widths, heights = calculate_image_dimensions_multiprocess(filtered_data, images_folder)
    if not widths:
        raise SystemExit("No readable images found; nothing to plot.")

    # Compute the extrema once; they feed the log line, the bins and the title.
    min_width, max_width = min(widths), max(widths)
    min_height, max_height = min(heights), max(heights)
    print(f"Max width: {max_width}, Max height: {max_height}")

    tokenized_lengths = calculate_tokenized_lengths(filtered_data)
    if not tokenized_lengths:
        raise SystemExit("No conversation turns found; nothing to plot.")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 12))

    # Plot 2D histogram. Bins are 100 px wide; when every image shares the
    # same width (or height) fall back to a single one-pixel-wide bin.
    if min_width == max_width:
        widths_bins = [min_width, max_width + 1]
    else:
        widths_bins = np.arange(min_width, max_width + 100, 100)
    if min_height == max_height:
        heights_bins = [min_height, max_height + 1]
    else:
        heights_bins = np.arange(min_height, max_height + 100, 100)

    # Only the returned image handle is needed, for the colorbar.
    _, _, _, image = ax1.hist2d(widths, heights, bins=[widths_bins, heights_bins], cmap=plt.cm.jet, density=True)
    fig.colorbar(image, ax=ax1)
    ax1.set_xlabel("Width")
    ax1.set_ylabel("Height")
    ax1.set_title(
        f"dist_{llava_instruct_name}_2d_w_h\nMax width: {max_width}, Max height: {max_height}",
        fontsize=10,
    )

    # Plot histogram of tokenized lengths using bins of 10 tokens.
    bins = np.arange(0, max(tokenized_lengths) + 10, 10)
    hist, bin_edges = np.histogram(tokenized_lengths, bins=bins)
    ax2.bar(bin_edges[:-1], hist, width=7, edgecolor="black", log=True)

    # Display every nth label on the x-axis
    n = 8  # Adjust this value to control the number of labels displayed
    ticks = bins[::n]
    tick_labels = [int(tick) for tick in ticks]
    ax2.set_xticks(ticks)
    ax2.set_xticklabels(tick_labels, rotation=90, fontsize=8)

    # The edges are increasing, so the first and last are the min and max.
    ax2.set_xlim(bin_edges[0], bin_edges[-1])
    ax2.set_xlabel("Tokenized Length")
    ax2.set_ylabel("Count (log scale)")
    ax2.set_title(f"dist_{llava_instruct_name}_tokenized_length", fontsize=8)

    plt.tight_layout()
    plt.savefig(f"./dist_{llava_instruct_name}_combined.png")
    # Release the figure once it has been written to disk.
    plt.close(fig)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()