# -*- coding: utf-8 -*-
#
# KGDataset.py
#
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import numpy as np

def _download_and_extract(url, path, filename):
    import zipfile
    import requests

    fn = os.path.join(path, filename)

    while True:
        try:
            # Try to extract an existing archive first; if it is missing or
            # corrupted, fall through to the download below and retry.
            with zipfile.ZipFile(fn) as zf:
                zf.extractall(path)
            print('Unzip finished.')
            break
        except Exception:
            os.makedirs(path, exist_ok=True)
            f_remote = requests.get(url, stream=True)
            assert f_remote.status_code == 200, 'fail to open {}'.format(url)
            with open(fn, 'wb') as writer:
                for chunk in f_remote.iter_content(chunk_size=1024*1024):
                    writer.write(chunk)
            print('Download finished. Unzipping the file...')

def _get_id(id_map, key):
    # Return the id of key, assigning the next consecutive id if unseen.
    val = id_map.get(key, None)
    if val is None:
        val = len(id_map)
        id_map[key] = val
    return val
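# Example (illustrative): ids are assigned in first-seen order.
#     m = {}
#     _get_id(m, 'a')  # -> 0
#     _get_id(m, 'b')  # -> 1
#     _get_id(m, 'a')  # -> 0 (already known)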

def _parse_srd_format(format):
    # Map a permutation string such as 'hrt' to the column indices of
    # (head, relation, tail) in one line of a triple file.
    perms = {
        "hrt": [0, 1, 2],
        "htr": [0, 2, 1],
        "rht": [1, 0, 2],
        "rth": [2, 0, 1],
        "thr": [1, 2, 0],
        "trh": [2, 1, 0],
    }
    assert format in perms, 'Unsupported triple format {}'.format(format)
    return perms[format]
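# Example (illustrative): for a file whose columns are ordered
# (tail, relation, head), _parse_srd_format('trh') returns [2, 1, 0],
# i.e. triple[2] is the head, triple[1] the relation, triple[0] the tail.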

def _file_line(path):
    # Count the number of lines in a file; returns 0 for an empty file.
    n = 0
    with open(path) as f:
        for n, _ in enumerate(f, 1):
            pass
    return n

class KGDataset:
    '''Load a knowledge graph

    The folder with a knowledge graph has five files:
    * entities stores the mapping between entity Id and entity name.
    * relations stores the mapping between relation Id and relation name.
    * train stores the triples in the training set.
    * valid stores the triples in the validation set.
    * test stores the triples in the test set.

    The mapping between entity (relation) Id and entity (relation) name is stored as 'id\tname'.

    The triples are stored as 'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, entity_path, relation_path, train_path,
                 valid_path=None, test_path=None, format=[0,1,2], skip_first_line=False):

        self.entity2id, self.n_entities = self.read_entity(entity_path)
        self.relation2id, self.n_relations = self.read_relation(relation_path)
        self.train = self.read_triple(train_path, "train", skip_first_line, format)
        if valid_path is not None:
            self.valid = self.read_triple(valid_path, "valid", skip_first_line, format)
        else:
            self.valid = None
        if test_path is not None:
            self.test = self.read_triple(test_path, "test", skip_first_line, format)
        else:
            self.test = None

    def read_entity(self, entity_path):
        with open(entity_path) as f:
            entity2id = {}
            for line in f:
                eid, entity = line.strip().split('\t')
                entity2id[entity] = int(eid)

        return entity2id, len(entity2id)

    def read_relation(self, relation_path):
        with open(relation_path) as f:
            relation2id = {}
            for line in f:
                rid, relation = line.strip().split('\t')
                relation2id[relation] = int(rid)

        return relation2id, len(relation2id)

    def read_triple(self, path, mode, skip_first_line=False, format=[0,1,2]):
        # mode: train/valid/test
        if path is None:
            return None

        print('Reading {} triples....'.format(mode))
        heads = []
        tails = []
        rels = []
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                triple = line.strip().split('\t')
                h, r, t = triple[format[0]], triple[format[1]], triple[format[2]]
                heads.append(self.entity2id[h])
                rels.append(self.relation2id[r])
                tails.append(self.entity2id[t])

        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))

        return (heads, rels, tails)
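# Usage sketch (hedged; the paths below are placeholders for a folder laid
# out as described in the KGDataset docstring):
#     ds = KGDataset('data/entities.dict', 'data/relations.dict',
#                    'data/train.txt', 'data/valid.txt', 'data/test.txt')
#     heads, rels, tails = ds.train  # three aligned int64 arrays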


class PartitionKGDataset():
    '''Load a partitioned knowledge graph

    The folder with a partitioned knowledge graph has four files:
    * relations stores the mapping between relation Id and relation name.
    * train stores the triples in the training set.
    * local_to_global stores the mapping between local entity Id and global entity Id.
    * partition_book stores the machine id of each entity

    The triples are stored as 'head_id\trelation_id\ttail_id'.
    '''
    def __init__(self, relation_path, train_path, local2global_path,
                 read_triple=True, skip_first_line=False):
        self.n_entities = _file_line(local2global_path)
        if skip_first_line:
            self.n_relations = _file_line(relation_path) - 1
        else:
            self.n_relations = _file_line(relation_path)
        if read_triple:
            self.train = self.read_triple(train_path, "train")

    def read_triple(self, path, mode):
        heads = []
        tails = []
        rels = []
        print('Reading {} triples....'.format(mode))
        with open(path) as f:
            for line in f:
                h, r, t = line.strip().split('\t')
                heads.append(int(h))
                rels.append(int(r))
                tails.append(int(t))

        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))

        return (heads, rels, tails)
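# Usage sketch (hedged; the file names follow the layout in the class docstring):
#     part = PartitionKGDataset('partition_0/relations.dict',
#                               'partition_0/train.txt',
#                               'partition_0/local_to_global.txt')
#     heads, rels, tails = part.train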


class KGDatasetFB15k(KGDataset):
    '''Load a knowledge graph FB15k

    The FB15k dataset has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    The mapping between entity (relation) Id and entity (relation) name is stored as 'id\tname'.
    The triples are stored as 'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='FB15k'):
        self.name = name
        url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)

        if not os.path.exists(os.path.join(path, name)):
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = os.path.join(path, name)

        super(KGDatasetFB15k, self).__init__(os.path.join(self.path, 'entities.dict'),
                                             os.path.join(self.path, 'relations.dict'),
                                             os.path.join(self.path, 'train.txt'),
                                             os.path.join(self.path, 'valid.txt'),
                                             os.path.join(self.path, 'test.txt'))


class KGDatasetFB15k237(KGDataset):
    '''Load a knowledge graph FB15k-237

    The FB15k-237 dataset has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    The mapping between entity (relation) Id and entity (relation) name is stored as 'id\tname'.
    The triples are stored as 'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='FB15k-237'):
        self.name = name
        url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)

        if not os.path.exists(os.path.join(path, name)):
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = os.path.join(path, name)

        super(KGDatasetFB15k237, self).__init__(os.path.join(self.path, 'entities.dict'),
                                                os.path.join(self.path, 'relations.dict'),
                                                os.path.join(self.path, 'train.txt'),
                                                os.path.join(self.path, 'valid.txt'),
                                                os.path.join(self.path, 'test.txt'))


class KGDatasetWN18(KGDataset):
    '''Load a knowledge graph wn18

    The wn18 dataset has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    The mapping between entity (relation) Id and entity (relation) name is stored as 'id\tname'.
    The triples are stored as 'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='wn18'):
        self.name = name
        url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)

        if not os.path.exists(os.path.join(path, name)):
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = os.path.join(path, name)

        super(KGDatasetWN18, self).__init__(os.path.join(self.path, 'entities.dict'),
                                            os.path.join(self.path, 'relations.dict'),
                                            os.path.join(self.path, 'train.txt'),
                                            os.path.join(self.path, 'valid.txt'),
                                            os.path.join(self.path, 'test.txt'))


class KGDatasetWN18rr(KGDataset):
    '''Load a knowledge graph wn18rr

    The wn18rr dataset has five files:
    * entities.dict stores the mapping between entity Id and entity name.
    * relations.dict stores the mapping between relation Id and relation name.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    The mapping between entity (relation) Id and entity (relation) name is stored as 'id\tname'.
    The triples are stored as 'head_name\trelation_name\ttail_name'.
    '''
    def __init__(self, path, name='wn18rr'):
        self.name = name
        url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)

        if not os.path.exists(os.path.join(path, name)):
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, name + '.zip')
        self.path = os.path.join(path, name)

        super(KGDatasetWN18rr, self).__init__(os.path.join(self.path, 'entities.dict'),
                                              os.path.join(self.path, 'relations.dict'),
                                              os.path.join(self.path, 'train.txt'),
                                              os.path.join(self.path, 'valid.txt'),
                                              os.path.join(self.path, 'test.txt'))

class KGDatasetFreebase(KGDataset):
    '''Load a knowledge graph Full Freebase

    The Freebase dataset has five files:
    * entity2id.txt stores the mapping between entity name and entity Id.
    * relation2id.txt stores the mapping between relation name and relation Id.
    * train.txt stores the triples in the training set.
    * valid.txt stores the triples in the validation set.
    * test.txt stores the triples in the test set.

    The mapping between entity (relation) name and entity (relation) Id is stored as 'name\tid';
    the first line of each mapping file gives the total count.
    The triples are stored as 'head_id\ttail_id\trelation_id'.
    '''
    def __init__(self, path, name='Freebase'):
        self.name = name
        url = 'https://data.dgl.ai/dataset/{}.zip'.format(name)

        if not os.path.exists(os.path.join(path, name)):
            print('File not found. Downloading from', url)
            _download_and_extract(url, path, '{}.zip'.format(name))
        self.path = os.path.join(path, name)

        super(KGDatasetFreebase, self).__init__(os.path.join(self.path, 'entity2id.txt'),
                                                os.path.join(self.path, 'relation2id.txt'),
                                                os.path.join(self.path, 'train.txt'),
                                                os.path.join(self.path, 'valid.txt'),
                                                os.path.join(self.path, 'test.txt'))

    def read_entity(self, entity_path):
        with open(entity_path) as f_ent:
            n_entities = int(f_ent.readline()[:-1])
        return None, n_entities

    def read_relation(self, relation_path):
        with open(relation_path) as f_rel:
            n_relations = int(f_rel.readline()[:-1])
        return None, n_relations

    def read_triple(self, path, mode, skip_first_line=False, format=None):
        heads = []
        tails = []
        rels = []
        print('Reading {} triples....'.format(mode))
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                h, t, r = line.strip().split('\t')
                heads.append(int(h))
                tails.append(int(t))
                rels.append(int(r))

        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))
        return (heads, rels, tails)

class KGDatasetUDDRaw(KGDataset):
    '''Load a user defined knowledge graph from raw triples

    The user defined dataset provides one file (train) or three files
    (train, valid, test):
    * train stores the triples in the training set, in format [src_name, rel_name, dst_name].
    * valid stores the triples in the validation set, in format [src_name, rel_name, dst_name].
    * test stores the triples in the test set, in format [src_name, rel_name, dst_name].

    The entity and relation Id mappings are generated from the triple files
    and stored as 'name\tid' in entities.tsv and relations.tsv.
    '''
    def __init__(self, path, name, files, format):
        self.name = name
        for f in files:
            assert os.path.exists(os.path.join(path, f)), \
                'File {} does not exist in {}'.format(f, path)

        assert len(format) == 3
        format = _parse_srd_format(format)
        self.load_entity_relation(path, files, format)

        # Only train set is provided
        if len(files) == 1:
            super(KGDatasetUDDRaw, self).__init__("entities.tsv",
                                                  "relation.tsv",
                                                  os.path.join(path, files[0]),
                                                  format=format)
        # Train, validation and test sets are provided
        if len(files) == 3:
            super(KGDatasetUDDRaw, self).__init__("entities.tsv",
                                                  "relation.tsv",
                                                  os.path.join(path, files[0]),
                                                  os.path.join(path, files[1]),
                                                  os.path.join(path, files[2]),
                                                  format=format)

    def load_entity_relation(self, path, files, format):
        entity_map = {}
        rel_map = {}
        for fi in files:
            with open(os.path.join(path, fi)) as f:
                for line in f:
                    triple = line.strip().split('\t')
                    src, rel, dst = triple[format[0]], triple[format[1]], triple[format[2]]
                    _get_id(entity_map, src)
                    _get_id(entity_map, dst)
                    _get_id(rel_map, rel)

        entities = ["{}\t{}\n".format(key, val) for key, val in entity_map.items()]
        with open(os.path.join(path, "entities.tsv"), "w+") as f:
            f.writelines(entities)
        self.entity2id = entity_map
        self.n_entities = len(entities)

        relations = ["{}\t{}\n".format(key, val) for key, val in rel_map.items()]
        with open(os.path.join(path, "relations.tsv"), "w+") as f:
            f.writelines(relations)
        self.relation2id = rel_map
        self.n_relations = len(relations)

    def read_entity(self, entity_path):
        return self.entity2id, self.n_entities
    
    def read_relation(self, relation_path):
        return self.relation2id, self.n_relations
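# Usage sketch (hedged; './data', 'my_kg' and 'train.tsv' are placeholders):
#     ds = KGDatasetUDDRaw('./data', 'my_kg', ['train.tsv'], 'hrt')
#     print(ds.n_entities, ds.n_relations)
#     heads, rels, tails = ds.train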

class KGDatasetUDD(KGDataset):
    '''Load a knowledge graph user defined dataset

    The user defined dataset has five files (or three, when only entities,
    relations and train are provided):
    * entities stores the mapping between entity name and entity Id.
    * relations stores the mapping between relation name and relation Id.
    * train stores the triples in the training set. In format [src_id, rel_id, dst_id]
    * valid stores the triples in the validation set. In format [src_id, rel_id, dst_id]
    * test stores the triples in the test set. In format [src_id, rel_id, dst_id]

    The mapping between entity (relation) name and entity (relation) Id is stored as 'name\tid'.
    The triples are stored as 'head_nid\trelation_id\ttail_nid'.
    '''
    def __init__(self, path, name, files, format):
        self.name = name
        for f in files:
            assert os.path.exists(os.path.join(path, f)), \
                'File {} does not exist in {}'.format(f, path)

        format = _parse_srd_format(format)
        if len(files) == 3:
            super(KGDatasetUDD, self).__init__(os.path.join(path, files[0]),
                                               os.path.join(path, files[1]),
                                               os.path.join(path, files[2]),
                                               None, None,
                                               format=format)
        if len(files) == 5:
            super(KGDatasetUDD, self).__init__(os.path.join(path, files[0]),
                                               os.path.join(path, files[1]),
                                               os.path.join(path, files[2]),
                                               os.path.join(path, files[3]),
                                               os.path.join(path, files[4]),
                                               format=format)

    def read_entity(self, entity_path):
        return None, _file_line(entity_path)

    def read_relation(self, relation_path):
        return None, _file_line(relation_path)

    def read_triple(self, path, mode, skip_first_line=False, format=[0,1,2]):
        heads = []
        tails = []
        rels = []
        print('Reading {} triples....'.format(mode))
        with open(path) as f:
            if skip_first_line:
                _ = f.readline()
            for line in f:
                triple = line.strip().split('\t')
                h, r, t = triple[format[0]], triple[format[1]], triple[format[2]]
                heads.append(int(h))
                tails.append(int(t))
                rels.append(int(r))
        heads = np.array(heads, dtype=np.int64)
        tails = np.array(tails, dtype=np.int64)
        rels = np.array(rels, dtype=np.int64)
        print('Finished. Read {} {} triples.'.format(len(heads), mode))
        return (heads, rels, tails)

def get_dataset(data_path, data_name, format_str, files=None):
    if format_str == 'built_in':
        if data_name == 'Freebase':
            dataset = KGDatasetFreebase(data_path)
        elif data_name == 'FB15k':
            dataset = KGDatasetFB15k(data_path)
        elif data_name == 'FB15k-237':
            dataset = KGDatasetFB15k237(data_path)
        elif data_name == 'wn18':
            dataset = KGDatasetWN18(data_path)
        elif data_name == 'wn18rr':
            dataset = KGDatasetWN18rr(data_path)
        else: 
            assert False, "Unknown dataset {}".format(data_name)
    elif format_str.startswith('raw_udd'):
        # user defined dataset
        format = format_str[8:]
        dataset = KGDatasetUDDRaw(data_path, data_name, files, format)
    elif format_str.startswith('udd'):
        # user defined dataset
        format = format_str[4:]
        dataset = KGDatasetUDD(data_path, data_name, files, format)
    else:
        assert False, "Unknown format {}".format(format_str)

    return dataset
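# Usage sketch (hedged; the paths and dataset names are placeholders):
#     # Built-in dataset, downloaded on first use:
#     ds = get_dataset('./data', 'FB15k', 'built_in')
#     # User defined dataset with raw name triples in head/rel/tail order:
#     ds = get_dataset('./my_data', 'my_kg', 'raw_udd_hrt', files=['train.tsv'])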


def get_partition_dataset(data_path, data_name, part_id):
    part_name = os.path.join(data_name, 'partition_'+str(part_id))
    path = os.path.join(data_path, part_name)

    if not os.path.exists(path):
        raise RuntimeError('Partition file not found in {}'.format(path))

    train_path = os.path.join(path, 'train.txt')
    local2global_path = os.path.join(path, 'local_to_global.txt')
    partition_book_path = os.path.join(path, 'partition_book.txt')

    if data_name == 'Freebase':
        relation_path = os.path.join(path, 'relation2id.txt')
        skip_first_line = True
    elif data_name in ['FB15k', 'FB15k-237', 'wn18', 'wn18rr']:
        relation_path = os.path.join(path, 'relations.dict')
        skip_first_line = False
    else:
        relation_path = os.path.join(path, 'relation.tsv')
        skip_first_line = False

    dataset = PartitionKGDataset(relation_path,
                                 train_path,
                                 local2global_path,
                                 read_triple=True,
                                 skip_first_line=skip_first_line)

    partition_book = []
    with open(partition_book_path) as f:
        for line in f:
            partition_book.append(int(line))

    local_to_global = []
    with open(local2global_path) as f:
        for line in f:
            local_to_global.append(int(line))

    return dataset, partition_book, local_to_global
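# Usage sketch (hedged; assumes the partition files were produced beforehand):
#     dataset, partition_book, local_to_global = \
#         get_partition_dataset('./data', 'FB15k', part_id=0)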


def get_server_partition_dataset(data_path, data_name, part_id):
    part_name = os.path.join(data_name, 'partition_'+str(part_id))
    path = os.path.join(data_path, part_name)

    if not os.path.exists(path):
        raise RuntimeError('Partition file not found in {}'.format(path))

    train_path = os.path.join(path, 'train.txt')
    local2global_path = os.path.join(path, 'local_to_global.txt')

    if data_name == 'Freebase':
        relation_path = os.path.join(path, 'relation2id.txt')
        skip_first_line = True
    elif data_name in ['FB15k', 'FB15k-237', 'wn18', 'wn18rr']:
        relation_path = os.path.join(path, 'relations.dict')
        skip_first_line = False
    else:
        relation_path = os.path.join(path, 'relation.tsv')
        skip_first_line = False

    dataset = PartitionKGDataset(relation_path,
                                 train_path,
                                 local2global_path,
                                 read_triple=False,
                                 skip_first_line=skip_first_line)

    n_entities = _file_line(os.path.join(path, 'partition_book.txt'))

    local_to_global = []
    with open(local2global_path) as f:
        for line in f:
            local_to_global.append(int(line))

    # Invert the local->global mapping into a dense global->local table.
    global_to_local = [0] * n_entities
    for i, global_id in enumerate(local_to_global):
        global_to_local[global_id] = i

    return global_to_local, dataset