# KGDataset.py

import os
import numpy as np

def _download_and_extract(url, path, filename):
    import shutil, zipfile
    import requests

    fn = os.path.join(path, filename)

    while True:
        try:
            with zipfile.ZipFile(fn) as zf:
                zf.extractall(path)
            print('Unzip finished.')
            break
        except Exception:
            os.makedirs(path, exist_ok=True)
            f_remote = requests.get(url, stream=True)
            sz = f_remote.headers.get('content-length')
            assert f_remote.status_code == 200, 'fail to open {}'.format(url)
            with open(fn, 'wb') as writer:
                for chunk in f_remote.iter_content(chunk_size=1024*1024):
                    writer.write(chunk)
            print('Download finished. Unzipping the file...')

def _get_id(dict, key):
    id = dict.get(key, None)
    if id is None:
        id = len(dict)
        dict[key] = id
    return id

def _parse_srd_format(format):
    if format == "hrt":
        return [0, 1, 2]
    if format == "htr":
        return [0, 2, 1]
    if format == "rht":
        return [1, 0, 2]
    if format == "rth":
        return [2, 0, 1]
    if format == "thr":
        return [1, 2, 0]
    if format == "trh":
        return [2, 1, 0]

class KGDataset:
    '''Load a knowledge graph

    The folder with a knowledge graph has five files:
    * entities stores the mapping between entity Id and entity name.
    * relations stores the mapping between relation Id and relation name.
    * train stores the triples in the training set.
    * valid stores the triples in the validation set.
    * test stores the triples in the test set.

    The mapping between entity (relation) Id and entity (relation) name is stored as 'id\tname'.

    The triples are stored as 'head_name\trelation_name\ttail_name'
    (or in another column order, see ``format``).

    Attributes set by the constructor:
    * entity2id / relation2id: name -> integer Id dictionaries.
    * n_entities / n_relations: number of entries in those dictionaries.
    * train: (heads, rels, tails) int64 arrays, only when read_triple is true.
    * valid / test: same layout (or None when the path is None), only when
      read_triple is true and only_train is false.
    '''
    def __init__(self, entity_path, relation_path,
                 train_path, valid_path=None, test_path=None,
                 format=[0,1,2], read_triple=True, only_train=False,
                 skip_first_line=False):
        '''Read the mappings and, optionally, the triple files.

        Parameters
        ----------
        entity_path, relation_path : str
            Mapping files, handed to read_entity / read_relation.
        train_path, valid_path, test_path : str or None
            Triple files, handed to read_triple.
        format : sequence of three ints
            Column indices of head, relation and tail in a triple line.
            The default list is only read, never modified.
        read_triple : bool
            When false no triple file is read at all.
        only_train : bool
            When true the validation and test files are not read.
        skip_first_line : bool
            When true the first line of every triple file is discarded.
        '''
        self.entity2id, self.n_entities = self.read_entity(entity_path)
        self.relation2id, self.n_relations = self.read_relation(relation_path)

        if read_triple:
            self.train = self.read_triple(train_path, "train", skip_first_line, format)
            if not only_train:
                self.valid = self.read_triple(valid_path, "valid", skip_first_line, format)
                self.test = self.read_triple(test_path, "test", skip_first_line, format)

    @staticmethod
    def _read_mapping(mapping_path):
        '''Parse an 'id\tname' file into ({name: id}, number_of_names).'''
        name2id = {}
        with open(mapping_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank (typically trailing) lines.
                    continue
                idx, name = line.split('\t')
                name2id[name] = int(idx)
        return name2id, len(name2id)

    def read_entity(self, entity_path):
        '''Return (entity2id, n_entities) read from ``entity_path``.'''
        return self._read_mapping(entity_path)

    def read_relation(self, relation_path):
        '''Return (relation2id, n_relations) read from ``relation_path``.'''
        return self._read_mapping(relation_path)

    def read_triple(self, path, mode, skip_first_line=False, format=[0,1,2]):
        '''Read a triple file and map its names to integer Ids.

        ``mode`` (train/valid/test) is not used by this implementation.
        Returns None when ``path`` is None, otherwise a tuple
        (heads, rels, tails) of int64 arrays. A name missing from
        entity2id / relation2id raises KeyError.
        '''
        if path is None:
            return None

        head_col, rel_col, tail_col = format[0], format[1], format[2]
        heads = []
        tails = []
        rels = []
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank (typically trailing) lines.
                    continue
                triple = line.split('\t')
                heads.append(self.entity2id[triple[head_col]])
                rels.append(self.relation2id[triple[rel_col]])
                tails.append(self.entity2id[triple[tail_col]])
        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)

        return (heads, rels, tails)

class KGDatasetFB15k(KGDataset):
    '''Load the FB15k knowledge graph, downloading it when it is absent

    The dataset lives in the directory <path>/<name> and has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    All files are parsed by the readers inherited from KGDataset: mapping
    lines are split as 'id\tname' and triple lines as
    'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='FB15k', read_triple=True, only_train=False):
        self.name = name
        dataset_dir = os.path.join(path, name)

        # Fetch and unpack <name>.zip into ``path`` when the folder is missing.
        if not os.path.exists(dataset_dir):
            url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = dataset_dir

        file_names = ('entities.dict', 'relations.dict',
                      'train.txt', 'valid.txt', 'test.txt')
        full_paths = [os.path.join(dataset_dir, file_name) for file_name in file_names]
        super().__init__(*full_paths,
                         read_triple=read_triple,
                         only_train=only_train)

class KGDatasetFB15k237(KGDataset):
    '''Load the FB15k-237 knowledge graph, downloading it when it is absent

    The dataset lives in the directory <path>/<name> and has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    All files are parsed by the readers inherited from KGDataset: mapping
    lines are split as 'id\tname' and triple lines as
    'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='FB15k-237', read_triple=True, only_train=False):
        self.name = name
        dataset_dir = os.path.join(path, name)

        # Fetch and unpack <name>.zip into ``path`` when the folder is missing.
        if not os.path.exists(dataset_dir):
            url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = dataset_dir

        file_names = ('entities.dict', 'relations.dict',
                      'train.txt', 'valid.txt', 'test.txt')
        full_paths = [os.path.join(dataset_dir, file_name) for file_name in file_names]
        super().__init__(*full_paths,
                         read_triple=read_triple,
                         only_train=only_train)

class KGDatasetWN18(KGDataset):
    '''Load the wn18 knowledge graph, downloading it when it is absent

    The dataset lives in the directory <path>/<name> and has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    All files are parsed by the readers inherited from KGDataset: mapping
    lines are split as 'id\tname' and triple lines as
    'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='wn18', read_triple=True, only_train=False):
        self.name = name
        dataset_dir = os.path.join(path, name)

        # Fetch and unpack <name>.zip into ``path`` when the folder is missing.
        if not os.path.exists(dataset_dir):
            url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = dataset_dir

        file_names = ('entities.dict', 'relations.dict',
                      'train.txt', 'valid.txt', 'test.txt')
        full_paths = [os.path.join(dataset_dir, file_name) for file_name in file_names]
        super().__init__(*full_paths,
                         read_triple=read_triple,
                         only_train=only_train)

class KGDatasetWN18rr(KGDataset):
    '''Load the wn18rr knowledge graph, downloading it when it is absent

    The dataset lives in the directory <path>/<name> and has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    All files are parsed by the readers inherited from KGDataset: mapping
    lines are split as 'id\tname' and triple lines as
    'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='wn18rr', read_triple=True, only_train=False):
        self.name = name
        dataset_dir = os.path.join(path, name)

        # Fetch and unpack <name>.zip into ``path`` when the folder is missing.
        if not os.path.exists(dataset_dir):
            url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = dataset_dir

        file_names = ('entities.dict', 'relations.dict',
                      'train.txt', 'valid.txt', 'test.txt')
        full_paths = [os.path.join(dataset_dir, file_name) for file_name in file_names]
        super().__init__(*full_paths,
                         read_triple=read_triple,
                         only_train=only_train)

class KGDatasetFreebase(KGDataset):
    '''Load a knowledge graph Full Freebase

    The Freebase dataset lives in the directory <path>/<name> and has five files:
    * entity2id.txt: only its first line is read, as the number of entities.
    * relation2id.txt: only its first line is read, as the number of relations.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    No name-to-Id dictionaries are built (entity2id and relation2id are None).
    The triples are stored as integer Ids, one 'head_id\ttail_id\trelation_id'
    line per triple.
    '''
    def __init__(self, path, name='Freebase', read_triple=True, only_train=False):
        self.name = name
        url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)

        # Fetch and unpack <name>.zip into ``path`` when the folder is missing.
        if not os.path.exists(os.path.join(path, name)):
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, '{}.zip'.format(name))
        self.path = os.path.join(path, name)

        super(KGDatasetFreebase, self).__init__(os.path.join(self.path, 'entity2id.txt'),
                                                os.path.join(self.path, 'relation2id.txt'),
                                                os.path.join(self.path, 'train.txt'),
                                                os.path.join(self.path, 'valid.txt'),
                                                os.path.join(self.path, 'test.txt'),
                                                read_triple=read_triple,
                                                only_train=only_train)

    def read_entity(self, entity_path):
        '''Return (None, n_entities); the count is the file's first line.'''
        with open(entity_path) as f_ent:
            # strip() instead of [:-1]: a first line without a trailing
            # newline must not lose its last digit.
            n_entities = int(f_ent.readline().strip())
        return None, n_entities

    def read_relation(self, relation_path):
        '''Return (None, n_relations); the count is the file's first line.'''
        with open(relation_path) as f_rel:
            n_relations = int(f_rel.readline().strip())
        return None, n_relations

    def read_triple(self, path, mode, skip_first_line=False, format=None):
        '''Read integer triples stored as 'head\ttail\trelation' lines.

        ``format`` is accepted because KGDataset.__init__ passes it
        positionally to read_triple; it is ignored since the column order
        of this dataset is fixed. Returns None when ``path`` is None,
        otherwise (heads, rels, tails) as int64 arrays.
        '''
        if path is None:
            return None

        heads = []
        tails = []
        rels = []
        print('Reading {} triples....'.format(mode))
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank (typically trailing) lines.
                    continue
                h, t, r = line.split('\t')
                heads.append(int(h))
                tails.append(int(t))
                rels.append(int(r))
        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))
        return (heads, rels, tails)

class KGDatasetUDDRaw(KGDataset):
    '''Load a knowledge graph user defined dataset given as raw triples

    ``files`` lists either one file (train) or three files (train, valid,
    test) inside ``path``. Every line of these files is a tab separated
    triple of entity / relation names whose column order is given by
    ``format``, one of 'hrt', 'htr', 'rht', 'rth', 'thr', 'trh'.

    All files are scanned once to give every entity and relation name an
    integer Id in order of first appearance. The resulting mappings are
    written into ``path`` as entities.tsv and relations.tsv, one
    'name\tId' line per entry, and are then used to load the triples.
    '''
    def __init__(self, path, name, files, format):
        self.name = name
        if len(files) not in (1, 3):
            # Any other count used to leave the object without triples.
            raise ValueError(
                'Expect 1 file (train) or 3 files (train, valid, test), '
                'got {}'.format(len(files)))
        for f in files:
            assert os.path.exists(os.path.join(path, f)), \
                'File {} does not exist in {}'.format(f, path)

        assert len(format) == 3
        columns = _parse_srd_format(format)
        if columns is None:
            raise ValueError('Unknown triple format {}'.format(format))
        # Must run before the base constructor: read_entity/read_relation
        # below hand back the mappings built here.
        self.load_entity_relation(path, files, columns)

        # Only train set is provided when there is a single file;
        # otherwise train, validation and test set are provided.
        only_train = len(files) == 1
        valid_path = None if only_train else os.path.join(path, files[1])
        test_path = None if only_train else os.path.join(path, files[2])
        super(KGDatasetUDDRaw, self).__init__(os.path.join(path, "entities.tsv"),
                                              os.path.join(path, "relations.tsv"),
                                              os.path.join(path, files[0]),
                                              valid_path,
                                              test_path,
                                              format=columns,
                                              read_triple=True,
                                              only_train=only_train)

    def load_entity_relation(self, path, files, format):
        '''Build the name -> Id mappings from all triple files and save them.

        ``format`` holds the column indices of head, relation and tail.
        Sets entity2id, n_entities, relation2id and n_relations and writes
        entities.tsv / relations.tsv into ``path``.
        '''
        entity_map = {}
        rel_map = {}
        for fi in files:
            with open(os.path.join(path, fi)) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank (typically trailing) lines.
                        continue
                    triple = line.split('\t')
                    src, rel, dst = triple[format[0]], triple[format[1]], triple[format[2]]
                    # Called for the side effect of registering new names.
                    _get_id(entity_map, src)
                    _get_id(entity_map, dst)
                    _get_id(rel_map, rel)

        # NOTE(review): the files are written as 'name\tId', whereas
        # KGDataset.read_entity parses 'Id\tname' - confirm which order
        # downstream consumers of these files expect.
        entities = ["{}\t{}\n".format(key, val) for key, val in entity_map.items()]
        with open(os.path.join(path, "entities.tsv"), "w+") as f:
            f.writelines(entities)
        self.entity2id = entity_map
        self.n_entities = len(entities)

        relations = ["{}\t{}\n".format(key, val) for key, val in rel_map.items()]
        with open(os.path.join(path, "relations.tsv"), "w+") as f:
            f.writelines(relations)
        self.relation2id = rel_map
        self.n_relations = len(relations)

    def read_entity(self, entity_path):
        '''Return the mapping built by load_entity_relation; the path is unused.'''
        return self.entity2id, self.n_entities

    def read_relation(self, relation_path):
        '''Return the mapping built by load_entity_relation; the path is unused.'''
        return self.relation2id, self.n_relations

class KGDatasetUDD(KGDataset):
    '''Load a knowledge graph user defined dataset with integer Ids

    ``files`` lists, relative to ``path``, either three files
    (entities, relations, train) or five files
    (entities, relations, train, valid, test):
    * entities / relations: only their lines are counted, giving
      n_entities / n_relations; no name-to-Id dictionary is built.
    * train / valid / test: one tab separated triple of integer Ids per
      line, with the column order given by ``format``, one of
      'hrt', 'htr', 'rht', 'rth', 'thr', 'trh'.

    With three files, valid and test are None (when they are read at all).
    '''
    def __init__(self, path, name, files, format, read_triple=True, only_train=False):
        self.name = name
        if len(files) not in (3, 5):
            # Any other count used to leave the object uninitialised.
            raise ValueError(
                'Expect 3 files (entities, relations, train) or 5 files '
                '(entities, relations, train, valid, test), got {}'.format(len(files)))
        for f in files:
            assert os.path.exists(os.path.join(path, f)), \
                'File {} does not exist in {}'.format(f, path)

        columns = _parse_srd_format(format)
        if read_triple and columns is None:
            # The column order is only needed when triples are parsed.
            raise ValueError('Unknown triple format {}'.format(format))

        full_paths = [os.path.join(path, f) for f in files]
        # With three files there is no valid/test set: pass None for both
        # (os.path.join cannot take None) and let read_triple return None.
        full_paths += [None] * (5 - len(full_paths))
        super(KGDatasetUDD, self).__init__(*full_paths,
                                           format=columns,
                                           read_triple=read_triple,
                                           only_train=only_train)

    def read_entity(self, entity_path):
        '''Return (None, n_entities) where the count is the number of lines.'''
        with open(entity_path) as f_ent:
            n_entities = sum(1 for _ in f_ent)
        return None, n_entities

    def read_relation(self, relation_path):
        '''Return (None, n_relations) where the count is the number of lines.'''
        with open(relation_path) as f_rel:
            n_relations = sum(1 for _ in f_rel)
        return None, n_relations

    def read_triple(self, path, mode, skip_first_line=False, format=[0,1,2]):
        '''Read integer triples whose columns are ordered as in ``format``.

        ``format`` holds the column indices of head, relation and tail; the
        default list is only read, never modified. Returns None when
        ``path`` is None, otherwise (heads, rels, tails) as int64 arrays.
        '''
        if path is None:
            return None

        heads = []
        tails = []
        rels = []
        print('Reading {} triples....'.format(mode))
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank (typically trailing) lines.
                    continue
                triple = line.split('\t')
                h, r, t = triple[format[0]], triple[format[1]], triple[format[2]]
                heads.append(int(h))
                tails.append(int(t))
                rels.append(int(r))
        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))
        return (heads, rels, tails)

def get_dataset(data_path, data_name, format_str, files=None):
    '''Create the dataset object selected by ``format_str``.

    * 'built_in': ``data_name`` picks one of Freebase, FB15k, FB15k-237,
      wn18 or wn18rr, loaded from ``data_path``.
    * 'raw_udd_<order>': user defined dataset with name based triples;
      everything after the 'raw_udd_' prefix is the column order.
    * 'udd_<order>': user defined dataset with Id based triples;
      everything after the 'udd_' prefix is the column order.

    ``files`` is only used by the two user defined variants. An unknown
    dataset name or format string fails an assertion.
    '''
    if format_str == 'built_in':
        builtin_loaders = {
            'Freebase': KGDatasetFreebase,
            'FB15k': KGDatasetFB15k,
            'FB15k-237': KGDatasetFB15k237,
            'wn18': KGDatasetWN18,
            'wn18rr': KGDatasetWN18rr,
        }
        loader = builtin_loaders.get(data_name)
        assert loader is not None, "Unknown dataset {}".format(data_name)
        return loader(data_path)

    if format_str.startswith('raw_udd'):
        # user defined dataset, triples given by name
        return KGDatasetUDDRaw(data_path, data_name, files, format_str[8:])

    if format_str.startswith('udd'):
        # user defined dataset, triples given by Id
        return KGDatasetUDD(data_path, data_name, files, format_str[4:])

    assert False, "Unknown format {}".format(format_str)


def get_partition_dataset(data_path, data_name, format_str, part_id, files=None):
    '''Load the training triples and Id tables of one dataset partition.

    The partition directory is <data_path>/<data_name>/part_<part_id>; it
    must contain partition_book.txt and local_to_global.txt, each holding
    one integer per line.

    Parameters
    ----------
    data_path, data_name : str
        Root folder and dataset name.
    format_str : str
        'built_in' for the built-in datasets, or 'udd_<order>' for a user
        defined dataset with Id based triples. Raw user defined datasets
        ('raw_udd...') are rejected.
    part_id : int
        Index of the partition.
    files : list of str, optional
        File list handed to KGDatasetUDD; required for 'udd' formats.

    Returns
    -------
    (dataset, partition_book, local_to_global) where the last two are the
    integer lists read from the two text files.

    Raises
    ------
    ValueError
        If a 'udd' format is requested without ``files``.
    '''
    part_name = os.path.join(data_name, 'part_'+str(part_id))
    if format_str == 'built_in':
        builtin_loaders = {
            'Freebase': KGDatasetFreebase,
            'FB15k': KGDatasetFB15k,
            'FB15k-237': KGDatasetFB15k237,
            'wn18': KGDatasetWN18,
            'wn18rr': KGDatasetWN18rr,
        }
        loader = builtin_loaders.get(data_name)
        assert loader is not None, "Unknown dataset {}".format(data_name)
        dataset = loader(data_path, part_name, read_triple=True, only_train=True)
    elif format_str.startswith('raw_udd'):
        # user defined dataset
        assert False, "When using partitioned dataset, we assume dataset will not be raw"
    elif format_str.startswith('udd'):
        # user defined dataset; the column order follows the 'udd_' prefix
        if files is None:
            # ``files`` used to be an undefined name in this branch.
            raise ValueError("files must be given for format {}".format(format_str))
        format = format_str[4:]
        # NOTE(review): files are resolved against data_path, not against
        # the partition directory - confirm this is the intended layout.
        dataset = KGDatasetUDD(data_path, data_name, files, format, read_triple=True, only_train=True)
    else:
        assert False, "Unknown format {}".format(format_str)

    path = os.path.join(data_path, part_name)

    with open(os.path.join(path, 'partition_book.txt')) as f:
        partition_book = [int(line) for line in f]

    with open(os.path.join(path, 'local_to_global.txt')) as f:
        local_to_global = [int(line) for line in f]

    return dataset, partition_book, local_to_global

def get_server_partition_dataset(data_path, data_name, format_str, part_id, files=None):
    '''Load the metadata of one dataset partition without its triples.

    The partition directory is <data_path>/<data_name>/part_<part_id>; it
    must contain partition_book.txt and local_to_global.txt.

    Parameters
    ----------
    data_path, data_name : str
        Root folder and dataset name.
    format_str : str
        'built_in' for the built-in datasets, or 'udd_<order>' for a user
        defined dataset with Id based triples. Raw user defined datasets
        ('raw_udd...') are rejected.
    part_id : int
        Index of the partition.
    files : list of str, optional
        File list handed to KGDatasetUDD; required for 'udd' formats.

    Returns
    -------
    (global_to_local, dataset). ``global_to_local`` has one slot per line
    of partition_book.txt; slot g holds the line index at which global Id
    g appears in local_to_global.txt, and stays 0 for Ids not listed there.

    Raises
    ------
    ValueError
        If a 'udd' format is requested without ``files``.
    '''
    part_name = os.path.join(data_name, 'part_'+str(part_id))

    if format_str == 'built_in':
        builtin_loaders = {
            'Freebase': KGDatasetFreebase,
            'FB15k': KGDatasetFB15k,
            'FB15k-237': KGDatasetFB15k237,
            'wn18': KGDatasetWN18,
            'wn18rr': KGDatasetWN18rr,
        }
        loader = builtin_loaders.get(data_name)
        assert loader is not None, "Unknown dataset {}".format(data_name)
        dataset = loader(data_path, part_name, read_triple=False, only_train=True)
    elif format_str.startswith('raw_udd'):
        # user defined dataset
        assert False, "When using partitioned dataset, we assume dataset will not be raw"
    elif format_str.startswith('udd'):
        # user defined dataset; the column order follows the 'udd_' prefix
        if files is None:
            # ``files`` used to be an undefined name in this branch.
            raise ValueError("files must be given for format {}".format(format_str))
        format = format_str[4:]
        # NOTE(review): files are resolved against data_path, not against
        # the partition directory - confirm this is the intended layout.
        dataset = KGDatasetUDD(data_path, data_name, files, format, read_triple=False, only_train=True)
    else:
        assert False, "Unknown format {}".format(format_str)

    path = os.path.join(data_path, part_name)

    # Count the lines with a context manager so the handle is closed.
    with open(os.path.join(path, 'partition_book.txt')) as f:
        n_entities = sum(1 for _ in f)

    with open(os.path.join(path, 'local_to_global.txt')) as f:
        local_to_global = [int(line) for line in f]

    # Invert the local -> global table.
    global_to_local = [0] * n_entities
    for local_id, global_id in enumerate(local_to_global):
        global_to_local[global_id] = local_id

    return global_to_local, dataset