# panduan.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

def process_data(filename):
    """Parse a comma-separated timing log and collapse consecutive sample pairs.

    Each usable line carries at least five comma-separated fields:
    ``operation, algo, proto, nbytes, timeUsec``. Blank lines, lines with
    fewer than five fields, and lines whose size/time fields do not parse as
    ``int``/``float`` are ignored.

    Samples are grouped by ``(operation, algo, proto)`` in file order. Within
    a group the samples are consumed two at a time (1st+2nd, 3rd+4th, ...);
    a pair contributes one point -- the smaller of its two times -- only when
    both samples report the same size. A trailing unpaired sample is dropped.

    Returns:
        A tuple ``(arrays, sizes_dict)`` of dicts keyed by
        ``(operation, algo, proto)``. ``arrays[key]`` holds the per-pair
        minimum times and ``sizes_dict[key]`` the parallel list of sizes.
    """
    grouped = {}

    with open(filename, 'r') as handle:
        for raw in handle:
            fields = [field.strip() for field in raw.split(',')]
            # Blank lines split into a single empty field and are rejected here too.
            if len(fields) < 5:
                continue
            operation, algo, proto, size_text, time_text = fields[:5]
            try:
                sample = (int(size_text), float(time_text))
            except ValueError:
                # Unparseable numeric field: skip the line.
                continue
            grouped.setdefault((operation, algo, proto), []).append(sample)

    arrays = {}
    sizes_dict = {}
    for key, samples in grouped.items():
        # Even-indexed samples zipped with odd-indexed ones gives the
        # non-overlapping pairs (0, 1), (2, 3), ...; zip drops an odd tail.
        kept = [
            (first[0], min(first[1], second[1]))
            for first, second in zip(samples[0::2], samples[1::2])
            if first[0] == second[0]
        ]
        arrays[key] = [value for _, value in kept]
        sizes_dict[key] = [size for size, _ in kept]

    return arrays, sizes_dict


def analyze_results(arrays, sizes_dict, threshold=0.035):
    """Report data points where a non-baseline combination beats the baseline.

    For every operation (first element of each key in ``arrays``) a baseline
    key is chosen: ``(operation, "Default", "Default")`` if present, else the
    first key whose algo is ``"Default"``, else the key with the most data
    points. All other keys of that operation are candidates. The series are
    truncated to the shortest one and compared index by index.

    A candidate only takes part at index ``i`` when its recorded size at that
    index equals the baseline's size. Groups can end up misaligned (for
    example when a pair was dropped upstream), and comparing timings of
    different message sizes is meaningless. If size information is missing
    for the baseline or for a candidate, that candidate is compared
    unconditionally.

    Args:
        arrays: dict mapping ``(operation, algo, proto)`` to a list of values.
        sizes_dict: dict with the same keys mapping to the parallel sizes.
        threshold: minimum relative improvement
            ``(baseline - best) / baseline`` a point must exceed to be
            reported. Defaults to ``0.035`` (3.5%).

    Returns:
        A list of tuples ``(operation, index, size, baseline_value,
        best_value, best_key, relative_improvement)``. ``size`` is ``None``
        when ``sizes_dict`` has no size for the baseline at that index.
    """
    results = []

    # Bucket the keys per operation in one pass; insertion order is kept so
    # ties between candidates resolve the same way as iterating ``arrays``.
    keys_by_operation = {}
    for key in arrays:
        keys_by_operation.setdefault(key[0], []).append(key)

    for operation in sorted(keys_by_operation):
        keys_for_op = keys_by_operation[operation]

        # Preferred baseline, with two fallbacks when it is absent.
        baseline_key = (operation, "Default", "Default")
        if baseline_key not in arrays:
            cand = next((k for k in keys_for_op if k[1] == "Default"), None)
            if cand is not None:
                baseline_key = cand
                print(f"Info: 对于操作 {operation}，未找到 (Default,Default) 基线，使用 {baseline_key} 作为基线（找到 algo=='Default'）。")
            else:
                # Otherwise use the key with the most data points.
                baseline_key = max(keys_for_op, key=lambda k: len(arrays[k]))
                print(f"Info: 对于操作 {operation}，未找到 (Default,Default) 基线，回退使用 {baseline_key} 作为基线（样本点最多）。")

        compare_types = [k for k in keys_for_op if k != baseline_key]
        if not compare_types:
            print(f"Warning: 操作 {operation} 没有可比较的类型（除基线 {baseline_key} 外）。跳过。")
            continue

        # The baseline is one of keys_for_op, so this is the shortest length
        # across the baseline and every candidate.
        min_len = min(len(arrays[k]) for k in keys_for_op)
        if min_len == 0:
            print(f"Warning: 操作 {operation} 基线或比较类型没有有效数据点（min_len=0），跳过。")
            continue

        baseline = arrays[baseline_key][:min_len]
        base_sizes = sizes_dict.get(baseline_key, [])
        # Looked up once per operation rather than once per data point.
        candidates = [(t, arrays[t], sizes_dict.get(t)) for t in compare_types]

        for i, base_val in enumerate(baseline):
            base_size = base_sizes[i] if i < len(base_sizes) else None

            min_type = None
            min_val = float('inf')
            for t, values, t_sizes in candidates:
                size_known = (
                    base_size is not None
                    and t_sizes is not None
                    and i < len(t_sizes)
                )
                if size_known and t_sizes[i] != base_size:
                    # Misaligned series: this point belongs to another size.
                    continue
                val = values[i]
                if val < min_val:
                    min_val = val
                    min_type = t

            # Non-positive baselines are skipped to avoid dividing by zero.
            if min_type is not None and base_val > 0:
                diff = (base_val - min_val) / base_val
                if diff > threshold:
                    results.append(
                        (operation, i, base_size, base_val, min_val, min_type, diff)
                    )

    return results


def save_results(results, output_file="qz.txt"):
    """Append one report line per result whose size is non-zero.

    ``results`` is an iterable of 7-tuples ``(operation, index, size,
    baseline, best_other, key, diff)``; ``key[1]`` and ``key[2]`` are written
    as the winning combination and ``diff`` is rendered as a percentage.
    The file is opened in append mode, so existing content is preserved.
    """
    def _report_lines():
        # Lazy on purpose: lines are formatted and written one by one while
        # the file is open.
        for operation, _idx, size, base, other, combo, diff in results:
            if size == 0:
                # Entries with Size=0 are filtered out of the report.
                continue
            yield (
                f"Operation: {operation}, Size={size} bytes, "
                f"baseline={base:.2f}, min_other={other:.2f} "
                f"({combo[1]},{combo[2]}), diff={diff*100:.2f}%\n"
            )

    with open(output_file, "a") as out:
        out.writelines(_report_lines())


if __name__ == "__main__":
    # Pipeline: parse the raw log, compare every combination against the
    # baseline, then append the findings to the report file.
    arrays, sizes = process_data("shuju.txt")
    results = analyze_results(arrays, sizes)
    save_results(results)
    print(f"找到 {len(results)} 条有效比较结果，已保存到 qz.txt")

    if not results:
        # Hints for an empty report. The first hint states the threshold
        # analyze_results applies by default (diff > 0.035, i.e. 3.5%).
        print("\n可能原因:")
        print("1. 没有算法组合比Default快3.5%以上")
        print("2. 数据分组不一致导致比较失败")
        print("3. 数据文件中缺少某些操作类型或算法组合的数据")
    else:
        # Count the reported results per operation (first tuple element).
        operation_stats = {}
        for result in results:
            operation = result[0]
            operation_stats[operation] = operation_stats.get(operation, 0) + 1

        print("\n各操作类型有效比较结果统计:")
        for op, count in operation_stats.items():
            print(f"  {op}: {count} 条")