"src/rpc/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "c8d4d6fbf828b1fde6bbb67867e66c509b3356e4"
Commit 0a8cfdda authored by zhangwq5's avatar zhangwq5
Browse files

all

parent 7052e81b
Pipeline #2914 failed with stages
in 0 seconds
# Graphormer_pytorch
<div align=center>
<img src="./doc/GF.png"/>
</div>

Graphormer 是一个深度学习软件包,它能让研究人员和开发人员为分子建模任务训练自定义模型。其旨在加速分子科学领域人工智能的研究与应用,例如材料发现、药物发现等,[项目网站](https://www.microsoft.com/en-us/research/project/graphormer/)
Graphormer 的高级预训练版本仅在以下平台提供:[Azure Quantum Elements](https://quantum.microsoft.com/en-us/our-story/quantum-elements-overview)
关于Graphormer的更多信息请前往[源码仓库](https://github.com/microsoft/Graphormer) 和 [用户手册]()
## 环境配置
### 硬件需求
DCU型号:K100_AI,节点数量:1台,卡数:2张。
### Docker
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk24.04.2-py3.10
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
cd /your_code_path/graphormer
# 克隆并安装 fairseq
git clone https://github.com/facebookresearch/fairseq.git
cd fairseq
git checkout 98ebe4f
pip install -e .
python setup.py build_ext --inplace
# 安装其他库
pip install -r requirements.txt
# 克隆 graphormer
git clone https://github.com/microsoft/Graphormer.git
# 替换文件
cp -f ./algos.pyx ./Graphormer/graphormer/data/algos.pyx
cp -f ./pyg_dataset_lookup_table.py ./Graphormer/graphormer/data/pyg_datasets/pyg_dataset_lookup_table.py
```
## 数据集
[示例数据集ZINC下载地址](https://uc2b1e201a894fcdcefc45447d3d.dl.dropboxusercontent.com/cd/0/get/CxApk-pOjXqveTD0AlvvU9WEEA2_kVEX8ekJVJe1vichmkjzq-LY7rmslrJFughtnKezOb8HyijbDxpMWW0mc-B4TtGE2UofmHz5-2_jnkaFWigtF6opQQvQiwMge5JCCyCMUsPJV__YUagJZOFRucbe/file?dl=1#)
## 训练
```bash
cd ./Graphormer/examples/property_prediction
mkdir ./dataset/raw
# 下载ZINC数据集并解压,将文件cp到./dataset/raw下
unzip molecules.zip
cp ./molecules/atom_dict.pickle ./dataset/raw
cp ./molecules/bond_dict.pickle ./dataset/raw
cp ./molecules/train.pickle ./dataset/raw
cp ./molecules/test.pickle ./dataset/raw
cp ./molecules/val.pickle ./dataset/raw
# 修改fairseq安装路径,将训练命令复制粘贴到./zinc.sh
export PYTHONPATH=/your_path_of/fairseq:$PYTHONPATH
fairseq-train \
--user-dir ../../graphormer \
--num-workers 0 \
--find-unused-parameters \
--dataset-name zinc \
--dataset-source pyg \
--task graph_prediction \
--criterion l1_loss \
--arch graphormer_slim \
--num-classes 1 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.01 \
--lr-scheduler polynomial_decay --power 1 --warmup-updates 60000 --total-num-update 400000 \
--lr 2e-4 --end-learning-rate 1e-9 \
--batch-size 64 \
--fp16 \
--data-buffer-size 20 \
--encoder-layers 12 \
--encoder-embed-dim 80 \
--encoder-ffn-embed-dim 80 \
--encoder-attention-heads 8 \
--max-epoch 2 \
--save-dir ./ckpts
# 启动训练, 第一次启动时会花费一些时间用于数据集转换,转换后的训练数据存放在./property_prediction/dataset/full/processed目录下
# 权重文件会保存在./property_prediction/ckpts下
bash zinc.sh
```
## 推理
## result
详见/graphormer/res文件夹
## 应用场景
### 算法类别
`训练微调`
### 热点应用行业
`金融,教育,政府,科研,制造,能源,交通`
## 源码仓库及问题反馈
- https://github.com/microsoft/Graphormer
## 参考资料
- https://github.com/ibm-granite/granite-speech-models
\ No newline at end of file
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import cython
from cython.parallel cimport prange, parallel
cimport numpy
import numpy
def floyd_warshall(adjacency_matrix):
    """All-pairs shortest paths via Floyd-Warshall on a dense adjacency matrix.

    Returns a pair ``(M, path)``: ``M[i][j]`` is the shortest-path length
    between nodes i and j (510 marks an unreachable pair), and ``path[i][j]``
    is an intermediate node on the shortest i->j path (-1 when the edge is
    direct, 510 when the pair is unreachable).
    """
    (nrows, ncols) = adjacency_matrix.shape
    assert nrows == ncols
    cdef unsigned int n = nrows
    # Work on a C-contiguous int64 copy so the raw pointer arithmetic below is valid.
    adj_mat_copy = adjacency_matrix.astype(numpy.int64, order='C', casting='safe', copy=True)
    assert adj_mat_copy.flags['C_CONTIGUOUS']
    cdef numpy.ndarray[long, ndim=2, mode='c'] M = adj_mat_copy
    cdef numpy.ndarray[long, ndim=2, mode='c'] path = -1 * numpy.ones([n, n], dtype=numpy.int64)
    cdef unsigned int i, j, k
    cdef long M_ij, M_ik, cost_ikkj
    # Raw pointers into M's buffer to avoid bounds-checked indexing in the hot loop.
    cdef long* M_ptr = &M[0,0]
    cdef long* M_i_ptr
    cdef long* M_k_ptr
    # set unreachable nodes distance to 510 (510 acts as "infinity" here)
    for i in range(n):
        for j in range(n):
            if i == j:
                M[i][j] = 0
            elif M[i][j] == 0:
                M[i][j] = 510
    # Floyd-Warshall main triple loop
    for k in range(n):
        M_k_ptr = M_ptr + n*k
        for i in range(n):
            M_i_ptr = M_ptr + n*i
            M_ik = M_i_ptr[k]
            for j in range(n):
                cost_ikkj = M_ik + M_k_ptr[j]
                M_ij = M_i_ptr[j]
                if M_ij > cost_ikkj:
                    M_i_ptr[j] = cost_ikkj
                    path[i][j] = k
    # clamp any still-unreachable pair back to the 510 sentinel in both outputs
    for i in range(n):
        for j in range(n):
            if M[i][j] >= 510:
                path[i][j] = 510
                M[i][j] = 510
    return M, path
def get_all_edges(path, i, j):
    """Recursively reconstruct the intermediate nodes on the shortest i->j path.

    ``path`` is the predecessor-style matrix produced by ``floyd_warshall``;
    an entry of -1 means the edge i->j is direct (no intermediates).
    """
    cdef int k = path[i][j]
    if k == -1:
        # Direct edge: no intermediate nodes between i and j.
        return []
    else:
        # Path goes i -> ... -> k -> ... -> j; expand both halves around k.
        return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)
def gen_edge_input(max_dist, path, edge_feat):
    """Collect edge features along every shortest path.

    Returns ``edge_fea_all`` of shape [n, n, max_dist, feat_dim], where
    ``[i, j, k, :]`` holds the feature of the k-th edge on the shortest
    i->j path; unused slots keep the -1 fill value.
    """
    (nrows, ncols) = path.shape
    assert nrows == ncols
    cdef unsigned int n = nrows
    cdef unsigned int max_dist_copy = max_dist
    # C-contiguous int64 copies so typed buffer indexing below is valid.
    path_copy = path.astype(numpy.int64, order='C', casting='safe', copy=True)
    edge_feat_copy = edge_feat.astype(numpy.int64, order='C', casting='safe', copy=True)
    assert path_copy.flags['C_CONTIGUOUS']
    assert edge_feat_copy.flags['C_CONTIGUOUS']
    cdef numpy.ndarray[long, ndim=4, mode='c'] edge_fea_all = -1 * numpy.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=numpy.int64)
    cdef unsigned int i, j, k, num_path, cur
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            if path_copy[i][j] == 510:
                # 510 marks an unreachable pair (see floyd_warshall); keep -1 padding.
                continue
            # Full node sequence of the shortest i->j path, endpoints included.
            path = [i] + get_all_edges(path_copy, i, j) + [j]
            num_path = len(path) - 1
            for k in range(num_path):
                edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :]
    return edge_fea_all
doc/GF.png

25.6 KB

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from typing import Optional
from torch_geometric.datasets import *
from torch_geometric.data import Dataset
from .pyg_dataset import GraphormerPYGDataset
import torch.distributed as dist
import os.path as osp
import pickle
import torch
from torch_geometric.datasets import ZINC
from torch_geometric.data import Data
class MyQM7b(QM7b):
    """QM7b wrapper that is safe under torch.distributed: only rank 0
    downloads/processes the data while the other ranks wait at a barrier."""

    def download(self):
        distributed = dist.is_initialized()
        if not distributed or dist.get_rank() == 0:
            super().download()
        if distributed:
            dist.barrier()

    def process(self):
        distributed = dist.is_initialized()
        if not distributed or dist.get_rank() == 0:
            super().process()
        if distributed:
            dist.barrier()
class MyQM9(QM9):
    """QM9 wrapper that is safe under torch.distributed: only rank 0
    downloads/processes the data while the other ranks wait at a barrier."""

    def download(self):
        distributed = dist.is_initialized()
        if not distributed or dist.get_rank() == 0:
            super().download()
        if distributed:
            dist.barrier()

    def process(self):
        distributed = dist.is_initialized()
        if not distributed or dist.get_rank() == 0:
            super().process()
        if distributed:
            dist.barrier()
class MyZINC(ZINC):
    """ZINC variant for distributed runs: downloading is disabled (raw pickles
    are expected to be placed in ``raw_dir`` manually), and only rank 0
    performs processing while other ranks wait at a barrier."""

    def download(self):
        if not dist.is_initialized() or dist.get_rank() == 0:
            # Intentionally a no-op: raw files are provided manually, not downloaded.
            pass
        if dist.is_initialized():
            dist.barrier()

    def process(self):
        # Only rank 0 converts the raw pickles into processed tensor files.
        if not dist.is_initialized() or dist.get_rank() == 0:
            for i, split in enumerate(['train', 'val', 'test']):
                input_path = osp.join(self.raw_dir, f'{split}.pickle')
                with open(input_path, 'rb') as f:
                    graphs = pickle.load(f)
                data_list = []
                for g in graphs:
                    # Node features: atom types reshaped to (num_nodes, 1) int64.
                    x = g['atom_type'].to(torch.long).view(-1, 1)
                    bond_info = g['bond_type']
                    # Regression target taken from 'logP_SA_cycle_normalized',
                    # reshaped to a (1, -1) float tensor.
                    y = g['logP_SA_cycle_normalized'].clone().detach().view(1, -1).to(torch.float)
                    # First two columns of bond_info are edge endpoints;
                    # the third column is the edge (bond) attribute.
                    edge_index = bond_info[:, :2].t().contiguous().to(torch.long)
                    edge_attr = bond_info[:, 2].to(torch.long)
                    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
                    data.num_nodes = len(x)
                    data_list.append(data)
                if self.pre_filter is not None:
                    data_list = [d for d in data_list if self.pre_filter(d)]
                if self.pre_transform is not None:
                    data_list = [self.pre_transform(d) for d in data_list]
                data, slices = self.collate(data_list)
                torch.save((data, slices), self.processed_paths[i])
        if dist.is_initialized():
            dist.barrier()
class MyMoleculeNet(MoleculeNet):
    """MoleculeNet wrapper that is safe under torch.distributed: only rank 0
    downloads/processes the data while the other ranks wait at a barrier."""

    def download(self):
        distributed = dist.is_initialized()
        if not distributed or dist.get_rank() == 0:
            super().download()
        if distributed:
            dist.barrier()

    def process(self):
        distributed = dist.is_initialized()
        if not distributed or dist.get_rank() == 0:
            super().process()
        if distributed:
            dist.barrier()
class PYGDatasetLookupTable:
    """Maps a dataset spec string to a :class:`GraphormerPYGDataset`."""

    @staticmethod
    def GetPYGDataset(dataset_spec: str, seed: int) -> Optional[Dataset]:
        """Build a Graphormer dataset from a spec string.

        Args:
            dataset_spec: Either ``"name"`` or ``"name:key=value,..."``
                (e.g. ``"moleculenet:name=bbbp"``).
            seed: Random seed forwarded to GraphormerPYGDataset.

        Returns:
            A GraphormerPYGDataset, or None when no inner dataset was built.

        Raises:
            ValueError: On a malformed spec or an unknown dataset name.
        """
        split_result = dataset_spec.split(":")
        if len(split_result) == 2:
            name, params = split_result[0], split_result[1]
            params = params.split(",")
        elif len(split_result) == 1:
            name = dataset_spec
            params = []
        else:
            # Previously a spec with more than one ':' fell through with
            # `name`/`params` unbound, crashing with UnboundLocalError.
            raise ValueError(
                f"Invalid dataset spec {dataset_spec!r}: "
                "expected 'name' or 'name:key=value,...'."
            )
        inner_dataset = None
        train_set = None
        valid_set = None
        test_set = None
        root = "dataset"
        if name == "qm7b":
            inner_dataset = MyQM7b(root=root)
        elif name == "qm9":
            inner_dataset = MyQM9(root=root)
        elif name == "zinc":
            # ZINC ships with predefined splits, so all three are loaded.
            inner_dataset = MyZINC(root=root)
            train_set = MyZINC(root=root, split="train")
            valid_set = MyZINC(root=root, split="val")
            test_set = MyZINC(root=root, split="test")
        elif name == "moleculenet":
            nm = None
            for param in params:
                # NOTE: use fresh names here — the original reused `name`,
                # clobbering the dataset name used in the error path below.
                key, value = param.split("=")
                if key == "name":
                    nm = value
            inner_dataset = MyMoleculeNet(root=root, name=nm)
        else:
            raise ValueError(f"Unknown dataset name {name} for pyg source.")
        if train_set is not None:
            # Predefined splits: hand them to GraphormerPYGDataset directly.
            return GraphormerPYGDataset(
                None,
                seed,
                None,
                None,
                None,
                train_set,
                valid_set,
                test_set,
            )
        else:
            # Single dataset: GraphormerPYGDataset performs the split itself.
            return (
                None
                if inner_dataset is None
                else GraphormerPYGDataset(inner_dataset, seed)
            )
# -*- coding: utf-8 -*-
"""
一个用于比较两个 PyTorch checkpoint (.pt 或 .ckpt) 文件中模型权重的脚本。
它会逐层比较权重,并根据预设的“平均绝对差异”阈值来判断是否“过关”。
"""
import torch
from collections import OrderedDict
# ==============================================================================
# 1. Configuration: checkpoint file paths, model-weights key, and pass threshold
# ==============================================================================
CKPT_PATH_1 = '/home/zwq/project/shangchaun/external/graphormer_pytorch/res/res_of_A800/checkpoint1.pt'
CKPT_PATH_2 = '/home/zwq/project/shangchaun/external/graphormer_pytorch/res/res_of_K100AI/checkpoint1.pt'
# Determined by prior inspection: the model weights live under the 'model' key.
MODEL_WEIGHTS_KEY = 'model'
# !! Core pass criterion !!
# Threshold on the per-layer mean absolute difference: the comparison PASSES
# only if every shared layer's difference stays below this value.
MEAN_ABS_DIFF_THRESHOLD = 0.02
# ==============================================================================
def extract_state_dict(checkpoint, model_key):
    """Return the state_dict stored under *model_key* in a loaded checkpoint.

    Raises:
        TypeError: if *checkpoint* is not a dict.
        KeyError: if *model_key* is not present, listing the keys that are.
    """
    if not isinstance(checkpoint, dict):
        raise TypeError(f"Checkpoint 文件加载后不是一个字典,而是一个 {type(checkpoint)}。")
    if model_key not in checkpoint:
        keys_found = list(checkpoint.keys())
        raise KeyError(
            f"在 checkpoint 中找不到指定的键 '{model_key}'。\n"
            f"文件中实际存在的键是: {keys_found}"
        )
    return checkpoint[model_key]
def normalize_keys(state_dict):
    """Strip the common 'module.' prefix (e.g. from DataParallel) off every key.

    Returns a new OrderedDict; insertion order and values are preserved.
    """
    prefix = 'module.'
    cleaned = OrderedDict()
    for key, value in state_dict.items():
        cleaned[key[len(prefix):] if key.startswith(prefix) else key] = value
    return cleaned
def compare_checkpoints(ckpt_path1, ckpt_path2, model_key, threshold):
    """Load two checkpoints and compare their per-layer weights.

    Prints a key-set summary, a per-layer mean-absolute-difference report for
    every shared layer whose tensors differ, and a final PASS/FAIL verdict
    based on *threshold*. Returns None; all results go to stdout.
    """
    print(f"[*] 正在加载 Checkpoint 1: {ckpt_path1}")
    ckpt1 = torch.load(ckpt_path1, map_location='cpu')
    print(f"[*] 正在加载 Checkpoint 2: {ckpt_path2}")
    ckpt2 = torch.load(ckpt_path2, map_location='cpu')
    print(f"\n[*] 正在从键 '{model_key}' 中提取并标准化 state_dict...")
    # Normalize keys so 'module.'-prefixed and plain checkpoints line up.
    sd1 = normalize_keys(extract_state_dict(ckpt1, model_key))
    sd2 = normalize_keys(extract_state_dict(ckpt2, model_key))
    keys1, keys2 = set(sd1.keys()), set(sd2.keys())
    common_keys = sorted(list(keys1.intersection(keys2)))
    unique_to_1, unique_to_2 = sorted(list(keys1 - keys2)), sorted(list(keys2 - keys1))
    print("\n" + "="*60)
    print(" 层名称比较摘要 (Layer Name Comparison Summary)")
    print("="*60)
    print(f"总层数 (文件1): {len(keys1)}")
    print(f"总层数 (文件2): {len(keys2)}")
    print(f"共有层数: {len(common_keys)}")
    if unique_to_1: print(f"文件1独有层数: {len(unique_to_1)}")
    if unique_to_2: print(f"文件2独有层数: {len(unique_to_2)}")
    print("\n" + "="*60)
    print(" 共有层权重差异详细分析 (Shared Layer Weight-Diff Analysis)")
    print(f" - 阈值 (Threshold for Mean Abs Diff): {threshold}")
    print("="*60)
    failing_layers = []
    for key in common_keys:
        tensor1, tensor2 = sd1[key], sd2[key]
        if tensor1.shape != tensor2.shape:
            print(f"层: {key} - [形状不匹配!] Shape Mismatch! {tensor1.shape} vs {tensor2.shape}")
            failing_layers.append((key, float('inf'), "形状不匹配"))  # mark as failed
            continue
        if torch.equal(tensor1, tensor2):
            continue  # skip identical layers to keep the output concise
        # Compare in float32 so integer/half-precision tensors diff cleanly.
        abs_diff = torch.abs(tensor1.float() - tensor2.float())
        mean_abs_diff = abs_diff.mean().item()
        # Core check: does the mean absolute difference exceed the threshold?
        if mean_abs_diff > threshold:
            status = f"❌ [不通过] (>{threshold})"
            failing_layers.append((key, mean_abs_diff, "超过阈值"))
        else:
            status = f"✅ [通过] (<={threshold})"
        print(f"层: {key}")
        print(f" - 平均绝对差 (Mean Abs Diff): {mean_abs_diff:.8f} --- {status}")
    print("\n" + "="*60)
    print(" 最终总结 (Final Conclusion)")
    print("="*60)
    # Warn when the two models do not share an identical layer structure.
    if unique_to_1 or unique_to_2:
        print("警告: 两个模型的层结构不完全一致,存在独有层。")
        print(" - 文件1 独有层:", unique_to_1 if unique_to_1 else "无")
        print(" - 文件2 独有层:", unique_to_2 if unique_to_2 else "无")
        print("-" * 20)
    # Final verdict based on the collected failing_layers list.
    if not failing_layers:
        print(f"✅ 过关 (PASS): 所有共有层的平均绝对差异都在阈值 {threshold} 之内。")
    else:
        print(f"❌ 不通过 (FAIL): 发现 {len(failing_layers)} 个层的差异不满足要求。")
        print("\n详细信息如下:")
        for layer_name, diff_value, reason in failing_layers:
            if reason == "形状不匹配":
                print(f" - 层: {layer_name}, 原因: {reason}")
            else:
                print(f" - 层: {layer_name}, 平均绝对差: {diff_value:.8f} (原因: {reason})")
if __name__ == '__main__':
    try:
        compare_checkpoints(CKPT_PATH_1, CKPT_PATH_2, MODEL_WEIGHTS_KEY, MEAN_ABS_DIFF_THRESHOLD)
    except Exception as e:
        # Top-level boundary: surface any failure as a short message
        # instead of an unhandled traceback.
        print(f"\n[程序执行出错]: {e}")
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment