generate_data.py
import tensorflow.compat.v1 as tf
from copy import deepcopy
import csv
import json
import os
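
# Note: the conversions below rely on eager execution (Tensor.numpy()); with
# tensorflow.compat.v1 under TF 2.x this holds as long as
# tf.disable_v2_behavior() / tf.disable_eager_execution() is never called.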


DTYPE = {
    'float32': tf.float32, 
    'int32': tf.int32, 
    'int64': tf.int64,
    'string': tf.string
}


def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Directory '{path}' created.")
    else:
        print(f"Directory '{path}' already exists.")


def convert_nested_array_dtype(arr):
    converted_arr = []
    for element in arr:
        if isinstance(element, list):
            # If the element is itself a list, recurse into it.
            converted_arr.append(convert_nested_array_dtype(element))
        else:
            # Otherwise decode the bytes element into a UTF-8 string.
            converted_arr.append(element.decode('utf-8'))
    return converted_arr
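
# Example: a tf.string tensor round-trips through .numpy().tolist() as nested
# bytes, e.g. convert_nested_array_dtype([[b"ab"], [b"cd"]]) yields [["ab"], ["cd"]].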
           

def generate_datas(tensors, batch_size):
    """Generate a random tensor for each (name, shape, dtype) row in `tensors`."""
    graph_datas = {}
    for tensor in tensors:
        _dtype = DTYPE[tensor[-1]]
        shapes = deepcopy(tensor[2])
        if shapes == ['']:
            # Empty shape means a scalar; pick a placeholder value that
            # matches the dtype (3.14 only converts cleanly to floats).
            if _dtype == tf.string:
                graph_datas[tensor[1]] = tf.constant("example_string", dtype=_dtype)
            elif _dtype in (tf.int32, tf.int64):
                graph_datas[tensor[1]] = tf.constant(3, dtype=_dtype)
            else:
                graph_datas[tensor[1]] = tf.constant(3.14, dtype=_dtype)
            continue
        for i in range(len(shapes)):
            if shapes[i] == "None":
                # An unknown (batch) dimension is filled with the requested batch size.
                shapes[i] = batch_size
            if shapes[i] != "":
                shapes[i] = int(shapes[i])

        shapes = tuple(shapes)

        if tensor[-1] in ("int32", "int64"):
            random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
        elif tensor[-1] == "string":
            # Fill the whole shape with a fixed placeholder string
            # (tf.fill handles any rank, unlike a hand-built nested list).
            random_tensor = tf.fill(shapes, "example_string")
        else:
            random_tensor = tf.random.normal(shape=shapes, mean=0.0, stddev=1.0, dtype=_dtype)

        graph_datas[tensor[1]] = random_tensor

    return graph_datas
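
# A minimal usage sketch; the row layout is inferred from the indexing above
# (column 1 = tensor name, column 2 = shape list, last column = dtype; the
# leading column is assumed to be a row index from the CSV):
#
#   tensors = [["0", "input_ids", ["None", "128"], "int32"]]
#   datas = generate_datas(tensors, batch_size=4)
#   # datas["input_ids"] -> a 4x128 random int32 tensor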


def read_csv_data(file_path):
    with open(file_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        datas = list(reader)
    for data in datas:
        # Strip the surrounding brackets from the shape column and split it
        # into a list of dimension strings.
        data[2] = data[2][1:-1].split(",")

    return datas
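
# The tensor CSVs are assumed to look like (header skipped by next(reader)):
#
#   index,name,shape,dtype
#   0,input_ids,"[None,128]",int32
#
# so data[2][1:-1].split(",") yields ["None", "128"].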


def save_graph_datasets_json(input_tensors, output_tensors, batch_size, input_data_json_path, output_data_json_path):
    input_graph_datas = generate_datas(input_tensors, batch_size)
    output_graph_datas = generate_datas(output_tensors, batch_size)

    def to_jsonable(tensor_dict):
        # Convert each tensor to nested Python lists so it can be dumped as
        # JSON; string tensors come back from .numpy() as bytes and must be
        # decoded to UTF-8 first.
        result = {}
        for key, value in tensor_dict.items():
            is_string = value.dtype == tf.string
            value = value.numpy().tolist()
            if is_string:
                value = convert_nested_array_dtype(value)
            result[key] = value
        return result

    feed_dict = to_jsonable(input_graph_datas)
    output_dict = to_jsonable(output_graph_datas)

    with open(input_data_json_path, 'w') as f:
        json.dump(feed_dict, f, indent=4)

    with open(output_data_json_path, 'w') as f:
        json.dump(output_dict, f, indent=4)
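
# The resulting JSON maps each tensor name to a nested list, for example
# (values vary per run since the tensors are random):
#
#   {"input_ids": [[3, 7, 1, ...], ...]}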


if __name__ == '__main__':
    
    model = "model_1"         # 测试模型 name
    model_dir = "./models"    # 模型目录
    dataset_path = os.path.join(model_dir, f'{model}/dataset')
   
    input_tensors_path = os.path.join(model_dir, f'{model}/input_tensors.csv')
    output_tensors_path = os.path.join(model_dir, f'{model}/output_tensors.csv')  
    input_tensors = read_csv_data(input_tensors_path)
    output_tensors = read_csv_data(output_tensors_path)
    
    create_directory(dataset_path)
    for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
        print("batch_size:", batch_size)
        input_data_json_path = os.path.join(dataset_path, f'input_tensor_datas_{batch_size}.json')
        output_data_json_path = os.path.join(dataset_path, f'output_tensor_datas_{batch_size}.json')
        save_graph_datasets_json(input_tensors, output_tensors, batch_size, input_data_json_path, output_data_json_path)