Commit de2e6515 authored by yuguo960516yuguo's avatar yuguo960516yuguo
Browse files

2.4.1-dtk-23.04

parent ad08b8ce
Pipeline #228 failed with stages
in 0 seconds
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mct/hash-map.hpp>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/common/chunk_allocator.h"
namespace paddle {
namespace distributed {
// Each shard's hash space is pre-split into 2^6 = 64 sub-buckets; the
// top CTR_SPARSE_SHARD_BUCKET_NUM_BITS bits of the key hash select the
// sub-bucket (see SparseTableShard::compute_bucket below).
static const int CTR_SPARSE_SHARD_BUCKET_NUM_BITS = 6;
static const size_t CTR_SPARSE_SHARD_BUCKET_NUM =
    static_cast<size_t>(1) << CTR_SPARSE_SHARD_BUCKET_NUM_BITS;
// A simple feature-value slot: a growable array of floats backed by a
// std::vector, exposing raw-pointer access for serialization paths.
class FixedFeatureValue {
 public:
  FixedFeatureValue() = default;
  ~FixedFeatureValue() = default;

  // Raw pointer to the first float (invalidated by resize()).
  float* data() { return values_.data(); }
  // Number of floats currently stored.
  size_t size() { return values_.size(); }
  // Grow or shrink the value to exactly `size` floats.
  void resize(size_t size) { values_.resize(size); }
  // Give surplus capacity back to the allocator.
  void shrink_to_fit() { values_.shrink_to_fit(); }

 private:
  std::vector<float> values_;
};
// One shard of a sparse parameter table: maps KEY -> VALUE*, where the
// VALUE objects live in a ChunkAllocator and the closed hash map stores
// them as type-erased mct::Pointer. The shard is internally split into
// CTR_SPARSE_SHARD_BUCKET_NUM sub-buckets selected by the high bits of
// the key hash; alignas(64) keeps shards on distinct cache lines to
// avoid false sharing between concurrently used shards.
template <class KEY, class VALUE>
struct alignas(64) SparseTableShard {
 public:
  typedef typename mct::closed_hash_map<KEY, mct::Pointer, std::hash<KEY>>
      map_type;
  // Iterator over the whole shard (all sub-buckets, in order).
  struct iterator {
    typename map_type::iterator it;
    size_t bucket;
    map_type* buckets;
    friend bool operator==(const iterator& a, const iterator& b) {
      return a.it == b.it;
    }
    friend bool operator!=(const iterator& a, const iterator& b) {
      return a.it != b.it;
    }
    const KEY& key() const { return it->first; }
    // The map stores a type-erased pointer; cast it back to VALUE.
    VALUE& value() const { return *(VALUE*)(void*)it->second; }     // NOLINT
    VALUE* value_ptr() const { return (VALUE*)(void*)it->second; }  // NOLINT
    iterator& operator++() {
      ++it;
      // Skip empty sub-buckets until an element is found or we land on
      // end() of the last sub-bucket (which is the shard's end()).
      while (it == buckets[bucket].end() &&
             bucket + 1 < CTR_SPARSE_SHARD_BUCKET_NUM) {
        it = buckets[++bucket].begin();
      }
      return *this;
    }
    iterator operator++(int) {
      iterator ret = *this;
      ++*this;
      return ret;
    }
  };
  // Iterator restricted to a single sub-bucket.
  struct local_iterator {
    typename map_type::iterator it;
    friend bool operator==(const local_iterator& a, const local_iterator& b) {
      return a.it == b.it;
    }
    friend bool operator!=(const local_iterator& a, const local_iterator& b) {
      return a.it != b.it;
    }
    const KEY& key() const { return it->first; }
    VALUE& value() const { return *(VALUE*)(void*)it->second; }  // NOLINT
    local_iterator& operator++() {
      ++it;
      return *this;
    }
    local_iterator operator++(int) { return {it++}; }
  };
  // Releases every VALUE back to the allocator.
  ~SparseTableShard() { clear(); }
  bool empty() { return _alloc.size() == 0; }
  // Element count across all sub-buckets (tracked by the allocator).
  size_t size() { return _alloc.size(); }
  void set_max_load_factor(float x) {
    for (size_t bucket = 0; bucket < CTR_SPARSE_SHARD_BUCKET_NUM; bucket++) {
      _buckets[bucket].max_load_factor(x);
    }
  }
  size_t bucket_count() { return CTR_SPARSE_SHARD_BUCKET_NUM; }
  size_t bucket_size(size_t bucket) { return _buckets[bucket].size(); }
  // Destroys every VALUE and empties all sub-buckets.
  void clear() {
    for (size_t bucket = 0; bucket < CTR_SPARSE_SHARD_BUCKET_NUM; bucket++) {
      map_type& data = _buckets[bucket];
      for (auto it = data.begin(); it != data.end(); ++it) {
        _alloc.release((VALUE*)(void*)it->second);  // NOLINT
      }
      data.clear();
    }
  }
  // First element of the shard; skips leading empty sub-buckets.
  iterator begin() {
    auto it = _buckets[0].begin();
    size_t bucket = 0;
    while (it == _buckets[bucket].end() &&
           bucket + 1 < CTR_SPARSE_SHARD_BUCKET_NUM) {
      it = _buckets[++bucket].begin();
    }
    return {it, bucket, _buckets};
  }
  // The shard's end() is end() of the last sub-bucket.
  iterator end() {
    return {_buckets[CTR_SPARSE_SHARD_BUCKET_NUM - 1].end(),
            CTR_SPARSE_SHARD_BUCKET_NUM - 1,
            _buckets};
  }
  local_iterator begin(size_t bucket) { return {_buckets[bucket].begin()}; }
  local_iterator end(size_t bucket) { return {_buckets[bucket].end()}; }
  // Hashes once and probes only the owning sub-bucket.
  iterator find(const KEY& key) {
    size_t hash = _hasher(key);
    size_t bucket = compute_bucket(hash);
    auto it = _buckets[bucket].find_with_hash(key, hash);
    if (it == _buckets[bucket].end()) {
      return end();
    }
    return {it, bucket, _buckets};
  }
  // Like std::map: inserts a default-constructed VALUE when absent.
  VALUE& operator[](const KEY& key) { return emplace(key).first.value(); }
  std::pair<iterator, bool> insert(const KEY& key, const VALUE& val) {
    return emplace(key, val);
  }
  std::pair<iterator, bool> insert(const KEY& key, VALUE&& val) {
    return emplace(key, std::move(val));
  }
  // Constructs the VALUE in the chunk allocator only when the key is
  // new; returns {iterator, inserted}.
  template <class... ARGS>
  std::pair<iterator, bool> emplace(const KEY& key, ARGS&&... args) {
    size_t hash = _hasher(key);
    size_t bucket = compute_bucket(hash);
    auto res = _buckets[bucket].insert_with_hash({key, NULL}, hash);
    if (res.second) {
      res.first->second = _alloc.acquire(std::forward<ARGS>(args)...);
    }
    return {{res.first, bucket, _buckets}, res.second};
  }
  // Erases and returns an iterator to the next element, which may live
  // in a later sub-bucket.
  iterator erase(iterator it) {
    _alloc.release((VALUE*)(void*)it.it->second);  // NOLINT
    size_t bucket = it.bucket;
    auto it2 = _buckets[bucket].erase(it.it);
    while (it2 == _buckets[bucket].end() &&
           bucket + 1 < CTR_SPARSE_SHARD_BUCKET_NUM) {
      it2 = _buckets[++bucket].begin();
    }
    return {it2, bucket, _buckets};
  }
  // Erase without computing a successor iterator (cheaper).
  void quick_erase(iterator it) {
    _alloc.release((VALUE*)(void*)it.it->second);  // NOLINT
    _buckets[it.bucket].quick_erase(it.it);
  }
  local_iterator erase(size_t bucket, local_iterator it) {
    _alloc.release((VALUE*)(void*)it.it->second);  // NOLINT
    return {_buckets[bucket].erase(it.it)};
  }
  void quick_erase(size_t bucket, local_iterator it) {
    _alloc.release((VALUE*)(void*)it.it->second);  // NOLINT
    _buckets[bucket].quick_erase(it.it);
  }
  // Erase by key; returns the number of elements removed (0 or 1).
  size_t erase(const KEY& key) {
    auto it = find(key);
    if (it == end()) {
      return 0;
    }
    quick_erase(it);
    return 1;
  }
  // Sub-bucket index = top CTR_SPARSE_SHARD_BUCKET_NUM_BITS bits of the
  // hash, so bucket selection and in-bucket probing use disjoint bits.
  size_t compute_bucket(size_t hash) {
    if (CTR_SPARSE_SHARD_BUCKET_NUM == 1) {
      return 0;
    } else {
      return hash >> (sizeof(size_t) * 8 - CTR_SPARSE_SHARD_BUCKET_NUM_BITS);
    }
  }

 private:
  map_type _buckets[CTR_SPARSE_SHARD_BUCKET_NUM];
  ChunkAllocator<VALUE> _alloc;
  std::hash<KEY> _hasher;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#include <future> // NOLINT
#include <memory>
#include <unordered_set>
#include <vector>
namespace paddle {
namespace distributed {
// A set of row ids whose mutations are serialized through a dedicated
// single-threaded pool, so callers never race on `set_`.
class ConcurrentSet {
 public:
  ConcurrentSet() : pool_(new ::ThreadPool(1)) {}
  ~ConcurrentSet() {}

  // Asynchronously inserts `rows` into the set; the returned future
  // completes when the insertion has been applied.
  std::future<void> Update(const std::vector<uint64_t>& rows) {
    auto task = [this, rows] {
      for (auto row : rows) {
        set_.insert(row);
      }
    };
    return pool_->enqueue(std::move(task));
  }

  // Asynchronously copies the accumulated ids into `*result` and clears
  // the set. `result` must stay alive until the returned future is
  // ready.
  std::future<void> GetAndClear(std::vector<uint64_t>* result) {
    // Capture the pointer BY VALUE. The original captured `&result`, a
    // reference to this function's parameter; once GetAndClear returns,
    // a still-queued task would read a dangling stack slot.
    auto task = [this, result] {
      result->clear();
      for (auto& id : set_) {
        result->push_back(id);
      }
      set_.clear();
    };
    return pool_->enqueue(std::move(task));
  }

 private:
  std::unordered_set<uint64_t> set_;
  std::unique_ptr<::ThreadPool> pool_{nullptr};
};
class GeoRecorder {
public:
explicit GeoRecorder(int trainer_num) : trainer_num_(trainer_num) {
trainer_rows_.reserve(trainer_num);
for (auto i = 0; i < trainer_num; ++i) {
trainer_rows_.emplace_back(new ConcurrentSet());
}
}
~GeoRecorder() = default;
void Update(const std::vector<uint64_t>& update_rows) {
VLOG(3) << " row size: " << update_rows.size();
std::vector<std::future<void>> fs;
for (auto& set : trainer_rows_) {
fs.push_back(set->Update(update_rows));
}
for (auto& f : fs) {
f.wait();
}
}
void GetAndClear(uint32_t trainer_id, std::vector<uint64_t>* result) {
VLOG(3) << "GetAndClear for trainer: " << trainer_id;
trainer_rows_.at(trainer_id)->GetAndClear(result).wait();
}
private:
const int trainer_num_;
std::vector<std::unique_ptr<ConcurrentSet>> trainer_rows_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
namespace paddle {
namespace distributed {
// Abstract base for parameter-value initializers. Concrete subclasses
// implement the scalar GetValue(); the bulk overloads default to
// calling it once per element.
class Initializer {
 public:
  Initializer() {}

  explicit Initializer(const std::vector<std::string> &attrs) {}

  // Produce one initialized value.
  virtual float GetValue() = 0;

  // Append `numel` generated values to `*values`.
  virtual void GetValue(std::vector<float> *values, int numel) {
    for (int i = 0; i < numel; ++i) {
      values->push_back(GetValue());
    }
  }

  // Fill value[0..numel) with generated values.
  virtual void GetValue(float *value, int numel) {
    for (int remaining = numel; remaining > 0; --remaining) {
      *value++ = GetValue();
    }
  }

  virtual ~Initializer() {}

 protected:
  std::string name_;
  unsigned int seed_;
};
class UniformInitializer : public Initializer {
public:
explicit UniformInitializer(const std::vector<std::string> &attrs) {
name_ = attrs[0];
seed_ = static_cast<unsigned int>(std::stoi(attrs[1]));
min_ = std::stof(attrs[2]);
max_ = std::stof(attrs[3]);
dist_ = std::uniform_real_distribution<float>(min_, max_);
random_engine_ = framework::GetCPURandomEngine(seed_);
}
float GetValue() override { return dist_(*random_engine_); }
void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = dist_(*random_engine_);
}
}
private:
float min_;
float max_;
std::shared_ptr<std::mt19937_64> random_engine_;
std::uniform_real_distribution<float> dist_;
};
class GaussianInitializer : public Initializer {
public:
explicit GaussianInitializer(const std::vector<std::string> &attrs) {
name_ = attrs[0];
seed_ = static_cast<unsigned int>(std::stoi(attrs[1]));
mean_ = std::stof(attrs[2]);
std_ = std::stof(attrs[3]);
random_engine_ = framework::GetCPURandomEngine(seed_);
dist_ = std::normal_distribution<float>(mean_, std_);
}
float GetValue() override { return dist_(*random_engine_); }
void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = dist_(*random_engine_);
}
}
private:
float std_;
float mean_;
std::shared_ptr<std::mt19937_64> random_engine_;
std::normal_distribution<float> dist_;
};
class TruncatedGaussianInitializer : public Initializer {
public:
explicit TruncatedGaussianInitializer(const std::vector<std::string> &attrs) {
name_ = attrs[0];
seed_ = static_cast<unsigned int>(std::stoi(attrs[1]));
mean_ = std::stof(attrs[2]);
std_ = std::stof(attrs[3]);
std::uniform_real_distribution<float> dist_(
std::numeric_limits<float>::min(), 1.0);
random_engine_ = framework::GetCPURandomEngine(seed_);
}
float GetValue() override {
paddle::operators::TruncatedNormal<float> truncated_normal(mean_, std_);
float value = truncated_normal(dist_(*random_engine_));
return value;
}
void GetValue(float *value, int numel) {
paddle::operators::TruncatedNormal<float> truncated_normal(mean_, std_);
for (int x = 0; x < numel; ++x) {
value[x] = truncated_normal(dist_(*random_engine_));
}
}
private:
float std_;
float mean_;
std::shared_ptr<std::mt19937_64> random_engine_;
std::uniform_real_distribution<float> dist_;
};
class FillConstantInitializer : public Initializer {
public:
explicit FillConstantInitializer(const std::vector<std::string> &attrs) {
name_ = attrs[0];
value_ = std::stof(attrs[1]);
}
float GetValue() override { return value_; }
void GetValue(float *value, int numel) { std::fill_n(value, numel, value_); }
private:
float value_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <rocksdb/db.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>
#include <rocksdb/table.h>
#include <rocksdb/write_batch.h>
#include <iostream>
#include <string>
namespace paddle {
namespace distributed {
// Process-wide singleton wrapping one RocksDB instance whose column
// families ("shard_0" .. "shard_{colnum-1}") back the SSD tier of the
// sparse table. All operations assert on failure instead of returning
// errors, so the int return codes are nominal (get() returns 1 only for
// a missing key).
class RocksDBHandler {
 public:
  RocksDBHandler() {}
  ~RocksDBHandler() {}

  // Meyers singleton; initialization is thread-safe since C++11.
  static RocksDBHandler* GetInstance() {
    static RocksDBHandler handler;
    return &handler;
  }

  // Wipes db_path (via `rm -rf`!) and (re)opens the DB with `colnum`
  // column families. Callers must pass a dedicated directory.
  int initialize(const std::string& db_path, const int colnum) {
    VLOG(3) << "db path: " << db_path << " colnum: " << colnum;
    rocksdb::Options options;
    rocksdb::BlockBasedTableOptions bbto;
    bbto.block_size = 4 * 1024;
    bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024);
    bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024);
    bbto.cache_index_and_filter_blocks = false;
    // 20 bits/key bloom filter + whole-key filtering: point lookups on
    // absent keys mostly avoid disk reads.
    bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false));
    bbto.whole_key_filtering = true;
    options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
    options.keep_log_file_num = 100;
    options.max_log_file_size = 50 * 1024 * 1024;  // 50MB
    options.create_if_missing = true;
    options.use_direct_reads = true;
    options.max_background_flushes = 5;
    options.max_background_compactions = 5;
    options.base_background_compactions = 10;
    options.write_buffer_size = 256 * 1024 * 1024;  // 256MB
    options.max_write_buffer_number = 8;
    options.max_bytes_for_level_base =
        options.max_write_buffer_number * options.write_buffer_size;
    options.min_write_buffer_number_to_merge = 1;
    options.target_file_size_base = 1024 * 1024 * 1024;  // 1024MB
    options.memtable_prefix_bloom_size_ratio = 0.02;
    options.num_levels = 4;
    options.max_open_files = -1;
    options.compression = rocksdb::kNoCompression;
    options.level0_file_num_compaction_trigger = 8;
    options.level0_slowdown_writes_trigger =
        1.8 * options.level0_file_num_compaction_trigger;
    options.level0_stop_writes_trigger =
        3.6 * options.level0_file_num_compaction_trigger;
    if (!db_path.empty()) {
      // NOTE(review): shells out and ignores system()'s return value;
      // a failed removal surfaces later as a DB::Open error.
      std::string rm_cmd = "rm -rf " + db_path;
      system(rm_cmd.c_str());
    }
    rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db);
    assert(s.ok());
    _handles.resize(colnum);
    for (int i = 0; i < colnum; i++) {
      s = _db->CreateColumnFamily(
          options, "shard_" + std::to_string(i), &_handles[i]);
      assert(s.ok());
    }
    LOG(INFO) << "DB initialize success, colnum:" << colnum;
    return 0;
  }

  // Writes one key/value into column family `id`. WAL is disabled:
  // durability is presumably handled by checkpointing above this layer.
  int put(
      int id, const char* key, int key_len, const char* value, int value_len) {
    rocksdb::WriteOptions options;
    options.disableWAL = true;
    rocksdb::Status s = _db->Put(options,
                                 _handles[id],
                                 rocksdb::Slice(key, key_len),
                                 rocksdb::Slice(value, value_len));
    assert(s.ok());
    return 0;
  }

  // Writes the first n key/value pairs atomically via a WriteBatch
  // (128 bytes of batch buffer reserved per entry).
  int put_batch(int id,
                std::vector<std::pair<char*, int>>& ssd_keys,
                std::vector<std::pair<char*, int>>& ssd_values,
                int n) {
    rocksdb::WriteOptions options;
    options.disableWAL = true;
    rocksdb::WriteBatch batch(n * 128);
    for (int i = 0; i < n; i++) {
      batch.Put(_handles[id],
                rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second),
                rocksdb::Slice(ssd_values[i].first, ssd_values[i].second));
    }
    rocksdb::Status s = _db->Write(options, &batch);
    assert(s.ok());
    return 0;
  }

  // Returns 0 and fills `value` on hit; returns 1 when the key is
  // absent.
  int get(int id, const char* key, int key_len, std::string& value) {
    rocksdb::Status s = _db->Get(rocksdb::ReadOptions(),
                                 _handles[id],
                                 rocksdb::Slice(key, key_len),
                                 &value);
    if (s.IsNotFound()) {
      return 1;
    }
    assert(s.ok());
    return 0;
  }

  // Deletes one key from column family `id` (no-op if absent).
  int del_data(int id, const char* key, int key_len) {
    rocksdb::WriteOptions options;
    options.disableWAL = true;
    rocksdb::Status s =
        _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len));
    assert(s.ok());
    return 0;
  }

  // Forces the memtable of column family `id` to disk.
  int flush(int id) {
    rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]);
    assert(s.ok());
    return 0;
  }

  // Caller owns the returned iterator and must delete it.
  rocksdb::Iterator* get_iterator(int id) {
    return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]);
  }

  // Approximate total key count across all column families.
  int get_estimate_key_num(uint64_t& num_keys) {
    _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys);
    return 0;
  }

 private:
  std::vector<rocksdb::ColumnFamilyHandle*> _handles;
  rocksdb::DB* _db;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace paddle {
namespace distributed {
// View over a batch of sparse keys pulled by a trainer. Does NOT own
// feasigns_/frequencies_: the backing storage (a caller's vectors or a
// serialized byte buffer) must outlive this object.
struct PullSparseValue {
  PullSparseValue() {}

  explicit PullSparseValue(int numel, int dim)
      : numel_(numel),
        dim_(dim),
        is_training_(true),
        feasigns_(nullptr),
        frequencies_(nullptr) {}

  explicit PullSparseValue(std::vector<uint64_t>& feasigns,     // NOLINT
                           std::vector<uint32_t>& frequencies,  // NOLINT
                           int dim) {
    numel_ = feasigns.size();
    dim_ = dim;
    is_training_ = true;
    feasigns_ = feasigns.data();
    frequencies_ = frequencies.data();
  }

  // Points this value at a serialized buffer laid out as:
  //   |---isTraining--------------|
  //   |---8*{num}B(keysData)------|
  //   |---4*{num}B(Frequencies)---|
  // numel_ must already be set to {num} before calling.
  void DeserializeFromBytes(void* bytes) {
    auto* begin = reinterpret_cast<char*>(bytes);
    is_training_ = reinterpret_cast<bool*>(begin)[0];
    feasigns_ = reinterpret_cast<uint64_t*>(begin + sizeof(bool));
    frequencies_ = reinterpret_cast<uint32_t*>(begin + sizeof(bool) +
                                               sizeof(uint64_t) * numel_);
  }

  // Collects into *offset_shard the indices of all keys that belong to
  // shard `shard_id` under modulo-`shard_num` partitioning.
  void Fission(const int shard_id,
               const int shard_num,
               std::vector<int>* offset_shard) const {
    offset_shard->reserve(numel_ / shard_num + 1);
    for (int x = 0; x < numel_; ++x) {
      // static_cast instead of the original C-style functional cast.
      if (static_cast<int>(feasigns_[x] % shard_num) == shard_id) {
        offset_shard->push_back(x);
      }
    }
  }

  int numel_;
  int dim_;
  bool is_training_;
  uint64_t* feasigns_;
  uint32_t* frequencies_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// Helper macros for declaring a variadic list of friend classes.
// REGISTER_GRAPH_FRIEND_CLASS(n, C1, ..., Cn) expands to
// `friend class C1; ... friend class Cn;` for n in [1, 11]; the DECLARE_k
// macros peel off one class name at a time.
#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a;
#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a)
#define DECLARE_2_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_3_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_4_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_5_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_6_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_7_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_8_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_9_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_10_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__)
#define DECLARE_11_FRIEND_CLASS(a, ...) \
  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__)
#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) \
  DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h"
#include <cstring>
namespace paddle {
namespace distributed {
// Unweighted blob: records only the neighbor id; `weight` is ignored
// (every edge implicitly weighs 1, see GraphEdgeBlob::get_weight).
// NOTE(review): the `= 1` default argument lives on this out-of-line
// definition, so it is visible only within this translation unit.
void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
  id_arr.push_back(id);
}
// Weighted blob: records the neighbor id and its weight in parallel
// arrays (id_arr[i] pairs with weight_arr[i]).
void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
  id_arr.push_back(id);
  weight_arr.push_back(weight);
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <cstdint>
#include <vector>
namespace paddle {
namespace distributed {
// Adjacency list of one graph node: stores neighbor ids only. Edge
// weight is implicit (always 1); WeightedGraphEdgeBlob adds per-edge
// weights.
class GraphEdgeBlob {
 public:
  GraphEdgeBlob() {}
  virtual ~GraphEdgeBlob() {}
  // Number of outgoing edges.
  size_t size() { return id_arr.size(); }
  // Appends one edge; defined out-of-line in graph_edge.cc (the weight
  // is ignored by this unweighted base class).
  virtual void add_edge(int64_t id, float weight);
  // Neighbor id at position idx (no bounds check).
  int64_t get_id(int idx) { return id_arr[idx]; }
  // Unweighted: every edge has weight 1.
  virtual float get_weight(int idx) { return 1; }
  // Mutable access to the raw id array, used for bulk export.
  std::vector<int64_t>& export_id_array() { return id_arr; }

 protected:
  std::vector<int64_t> id_arr;
};
class WeightedGraphEdgeBlob : public GraphEdgeBlob {
public:
WeightedGraphEdgeBlob() {}
virtual ~WeightedGraphEdgeBlob() {}
virtual void add_edge(int64_t id, float weight);
virtual float get_weight(int idx) { return weight_arr[idx]; }
protected:
std::vector<float> weight_arr;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
#include <cstring>
namespace paddle {
namespace distributed {
// Releases the owned sampler and edge blob. Both are raw owning
// pointers allocated in build_sampler()/build_edges().
GraphNode::~GraphNode() {
  if (sampler != nullptr) {
    delete sampler;
    sampler = nullptr;
  }
  if (edges != nullptr) {
    delete edges;
    edges = nullptr;
  }
}
// Field byte widths used by the binary (de)serialization format below.
int Node::weight_size = sizeof(float);
int Node::id_size = sizeof(uint64_t);
int Node::int_size = sizeof(int);

// Serialized size of a plain Node: id followed by a feature count.
int Node::get_size(bool need_feature) { return id_size + int_size; }

// Layout: [id : id_size][feat_num : int_size]. A plain Node carries no
// features, so feat_num is always written as 0 and need_feature is
// ignored here.
void Node::to_buffer(char* buffer, bool need_feature) {
  memcpy(buffer, &id, id_size);
  buffer += id_size;
  int feat_num = 0;
  memcpy(buffer, &feat_num, sizeof(int));
}

// Reads back only the id; a plain Node has no feature payload.
void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); }
// Serialized size: id + feat_num header, plus (when need_feature) one
// int length prefix per feature string and the string bytes themselves.
int FeatureNode::get_size(bool need_feature) {
  int size = id_size + int_size;  // id, feat_num
  if (need_feature) {
    size += feature.size() * int_size;
    for (const std::string& fea : feature) {
      size += fea.size();
    }
  }
  return size;
}
// Lazily allocates the adjacency blob. A second call is a no-op, so the
// weighted/unweighted choice is fixed by the first call.
void GraphNode::build_edges(bool is_weighted) {
  if (edges == nullptr) {
    if (is_weighted == true) {
      edges = new WeightedGraphEdgeBlob();
    } else {
      edges = new GraphEdgeBlob();
    }
  }
}
// Lazily creates the neighbor sampler ("random" or "weighted") and
// binds it to the edge blob. No-op when a sampler already exists or
// when sample_type is not recognized.
void GraphNode::build_sampler(std::string sample_type) {
  if (sampler != nullptr) {
    return;
  }
  if (sample_type == "random") {
    sampler = new RandomSampler();
  } else if (sample_type == "weighted") {
    sampler = new WeightedSampler();
  } else {
    // Unknown sample_type: previously execution fell through and
    // dereferenced the still-null sampler below. Treat it as "no
    // sampler built" instead of crashing.
    return;
  }
  sampler->build(edges);
}
// Layout: [id][feat_num][len_0][bytes_0]...[len_{n-1}][bytes_{n-1}].
// When need_feature is false only [id][feat_num=0] is written. The
// caller must size `buffer` via get_size(need_feature).
void FeatureNode::to_buffer(char* buffer, bool need_feature) {
  memcpy(buffer, &id, id_size);
  buffer += id_size;
  int feat_num = 0;
  int feat_len;
  if (need_feature) {
    feat_num += feature.size();
    memcpy(buffer, &feat_num, sizeof(int));
    buffer += sizeof(int);
    for (int i = 0; i < feat_num; ++i) {
      // Length prefix, then the raw (possibly binary) feature bytes.
      feat_len = feature[i].size();
      memcpy(buffer, &feat_len, sizeof(int));
      buffer += sizeof(int);
      memcpy(buffer, feature[i].c_str(), feature[i].size());
      buffer += feature[i].size();
    }
  } else {
    memcpy(buffer, &feat_num, sizeof(int));
  }
}
// Inverse of to_buffer: reads [id][feat_num][len_i][bytes_i]... and
// rebuilds the feature vector. `buffer` must hold a record produced by
// to_buffer(..., true) (or one with feat_num == 0).
void FeatureNode::recover_from_buffer(char* buffer) {
  int feat_num, feat_len;
  memcpy(&id, buffer, id_size);
  buffer += id_size;
  memcpy(&feat_num, buffer, sizeof(int));
  buffer += sizeof(int);
  feature.clear();
  for (int i = 0; i < feat_num; ++i) {
    memcpy(&feat_len, buffer, sizeof(int));
    buffer += sizeof(int);
    // Construct the string straight from (ptr, len). The original
    // copied through a non-standard VLA `char str[feat_len + 1]`
    // (unbounded stack allocation) and then built std::string(str),
    // which truncated at the first embedded 0x00 byte — corrupting
    // binary feature payloads such as packed uint64 ids.
    feature.emplace_back(buffer, feat_len);
    buffer += feat_len;
  }
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstring>
#include <iostream>
#include <memory>
#include <set>
#include <sstream>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
// Base class of the graph node hierarchy: holds only the node id and
// declares the virtual interface that GraphNode (edges + sampling) and
// FeatureNode (feature payload) specialize. The default implementations
// are no-ops or neutral values.
class Node {
 public:
  Node() {}
  Node(uint64_t id) : id(id) {}
  virtual ~Node() {}
  // Byte widths of the binary serialization fields; defined in
  // graph_node.cc.
  static int id_size, int_size, weight_size;
  uint64_t get_id() { return id; }
  // Signed view of the id (used by Python bindings).
  int64_t get_py_id() { return (int64_t)id; }
  void set_id(uint64_t id) { this->id = id; }
  virtual void build_edges(bool is_weighted) {}
  virtual void build_sampler(std::string sample_type) {}
  virtual void add_edge(uint64_t id, float weight) {}
  // Sample up to k neighbor indices; the base class has no neighbors.
  virtual std::vector<int> sample_k(
      int k, const std::shared_ptr<std::mt19937_64> rng) {
    return std::vector<int>();
  }
  virtual uint64_t get_neighbor_id(int idx) { return 0; }
  virtual float get_neighbor_weight(int idx) { return 1.; }
  // Binary (de)serialization; implemented in graph_node.cc.
  virtual int get_size(bool need_feature);
  virtual void to_buffer(char *buffer, bool need_feature);
  virtual void recover_from_buffer(char *buffer);
  virtual std::string get_feature(int idx) { return std::string(""); }
  virtual int get_feature_ids(std::vector<uint64_t> *res) const { return 0; }
  virtual int get_feature_ids(int slot_idx, std::vector<uint64_t> *res) const {
    return 0;
  }
  virtual void set_feature(int idx, const std::string &str) {}
  virtual void set_feature_size(int size) {}
  virtual int get_feature_size() { return 0; }
  virtual size_t get_neighbor_size() { return 0; }

 protected:
  uint64_t id;
  bool is_weighted;
};
// Node with adjacency (edges) and an optional neighbor sampler. Both
// members are owned raw pointers, released in the destructor
// (graph_node.cc).
class GraphNode : public Node {
 public:
  GraphNode() : Node(), sampler(nullptr), edges(nullptr) {}
  GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {}
  virtual ~GraphNode();
  virtual void build_edges(bool is_weighted);
  virtual void build_sampler(std::string sample_type);
  // Requires build_edges() to have been called first.
  virtual void add_edge(uint64_t id, float weight) {
    edges->add_edge(id, weight);
  }
  // Requires build_sampler() to have been called first.
  virtual std::vector<int> sample_k(
      int k, const std::shared_ptr<std::mt19937_64> rng) {
    return sampler->sample_k(k, rng);
  }
  virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); }
  virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); }
  virtual size_t get_neighbor_size() { return edges->size(); }

 protected:
  Sampler *sampler;
  GraphEdgeBlob *edges;
};
// Node that carries per-slot feature payloads. Each entry of `feature`
// is a raw byte string; for id-type slots it holds a packed array of
// uint64_t values (see get_feature_ids / parse_value_to_bytes).
class FeatureNode : public Node {
 public:
  FeatureNode() : Node() {}
  FeatureNode(uint64_t id) : Node(id) {}
  virtual ~FeatureNode() {}
  // Binary (de)serialization; implemented in graph_node.cc.
  virtual int get_size(bool need_feature);
  virtual void to_buffer(char *buffer, bool need_feature);
  virtual void recover_from_buffer(char *buffer);

  // Returns the raw bytes of slot `idx`, or "" when out of range.
  virtual std::string get_feature(int idx) {
    if (idx < static_cast<int>(this->feature.size())) {
      return this->feature[idx];
    } else {
      return std::string("");
    }
  }

  // Appends the uint64 ids packed in every slot to *res; each slot's
  // byte length must be a multiple of sizeof(uint64_t). Returns 0.
  virtual int get_feature_ids(std::vector<uint64_t> *res) const {
    PADDLE_ENFORCE_NOT_NULL(res,
                            paddle::platform::errors::InvalidArgument(
                                "get_feature_ids res should not be null"));
    errno = 0;
    for (auto &feature_item : feature) {
      const uint64_t *feas =
          reinterpret_cast<const uint64_t *>(feature_item.c_str());
      size_t num = feature_item.length() / sizeof(uint64_t);
      CHECK((feature_item.length() % sizeof(uint64_t)) == 0)
          << "bad feature_item: [" << feature_item << "]";
      size_t n = res->size();
      res->resize(n + num);
      for (size_t i = 0; i < num; ++i) {
        (*res)[n + i] = feas[i];
      }
    }
    PADDLE_ENFORCE_EQ(
        errno,
        0,
        paddle::platform::errors::InvalidArgument(
            "get_feature_ids get errno should be 0, but got %d.", errno));
    return 0;
  }

  // Same as above but for a single slot; clears *res first. An
  // out-of-range slot_idx yields an empty result.
  virtual int get_feature_ids(int slot_idx, std::vector<uint64_t> *res) const {
    PADDLE_ENFORCE_NOT_NULL(res,
                            paddle::platform::errors::InvalidArgument(
                                "get_feature_ids res should not be null"));
    res->clear();
    errno = 0;
    if (slot_idx < static_cast<int>(this->feature.size())) {
      const std::string &s = this->feature[slot_idx];
      const uint64_t *feas = reinterpret_cast<const uint64_t *>(s.c_str());
      size_t num = s.length() / sizeof(uint64_t);
      CHECK((s.length() % sizeof(uint64_t)) == 0)
          << "bad feature_item: [" << s << "]";
      res->resize(num);
      for (size_t i = 0; i < num; ++i) {
        (*res)[i] = feas[i];
      }
    }
    PADDLE_ENFORCE_EQ(
        errno,
        0,
        paddle::platform::errors::InvalidArgument(
            "get_feature_ids get errno should be 0, but got %d.", errno));
    return 0;
  }

  // Grows the slot table if needed and returns a mutable slot pointer.
  virtual std::string *mutable_feature(int idx) {
    if (idx >= static_cast<int>(this->feature.size())) {
      this->feature.resize(idx + 1);
    }
    return &(this->feature[idx]);
  }

  virtual void set_feature(int idx, const std::string &str) {
    if (idx >= static_cast<int>(this->feature.size())) {
      this->feature.resize(idx + 1);
    }
    this->feature[idx] = str;
  }
  virtual void set_feature_size(int size) { this->feature.resize(size); }
  virtual int get_feature_size() { return this->feature.size(); }

  // Parses each decimal string in feat_str as a T and returns the
  // concatenated raw little/native-endian bytes.
  template <typename T>
  static std::string parse_value_to_bytes(std::vector<std::string> feat_str) {
    T v;
    size_t Tsize = sizeof(T) * feat_str.size();
    // std::string as byte buffer: the original used a C VLA
    // (char buffer[Tsize]), which is non-standard C++ and an unbounded
    // stack allocation.
    std::string out(Tsize, '\0');
    for (size_t i = 0; i < feat_str.size(); i++) {
      std::stringstream ss(feat_str[i]);
      ss >> v;
      std::memcpy(&out[sizeof(T) * i], reinterpret_cast<char *>(&v),
                  sizeof(T));
    }
    return out;
  }

  // Iterator-range variant writing the packed bytes into *output
  // (replacing its previous contents).
  template <typename T>
  static void parse_value_to_bytes(
      std::vector<std::string>::iterator feat_str_begin,
      std::vector<std::string>::iterator feat_str_end,
      std::string *output) {
    T v;
    size_t feat_str_size = feat_str_end - feat_str_begin;
    size_t Tsize = sizeof(T) * feat_str_size;
    // Size the output explicitly. The original filled a stack VLA and
    // called output->assign(buffer), which (a) is non-standard and
    // (b) truncated at the first zero byte, corrupting any packed
    // value that contains 0x00.
    output->assign(Tsize, '\0');
    for (size_t i = 0; i < feat_str_size; i++) {
      std::stringstream ss(*(feat_str_begin + i));
      ss >> v;
      std::memcpy(&(*output)[sizeof(T) * i], reinterpret_cast<char *>(&v),
                  sizeof(T));
    }
  }

  // Inverse of parse_value_to_bytes: splits the byte string into Ts.
  // Trailing bytes smaller than sizeof(T) are read past-the-end by the
  // original logic too, so the input length must be a multiple of
  // sizeof(T).
  template <typename T>
  static std::vector<T> parse_bytes_to_array(std::string feat_str) {
    T v;
    std::vector<T> out;
    size_t start = 0;
    const char *buffer = feat_str.data();
    while (start < feat_str.size()) {
      std::memcpy(reinterpret_cast<char *>(&v), buffer + start, sizeof(T));
      start += sizeof(T);
      out.push_back(v);
    }
    return out;
  }

  // str_ptr variant: APPENDS the parsed Ts onto *output (does not
  // clear it), parsing with a reusable thread-local stream.
  template <typename T>
  static void parse_value_to_bytes(
      std::vector<paddle::string::str_ptr>::iterator feat_str_begin,
      std::vector<paddle::string::str_ptr>::iterator feat_str_end,
      std::string *output) {
    size_t feat_str_size = feat_str_end - feat_str_begin;
    size_t Tsize = sizeof(T) * feat_str_size;
    size_t num = output->length();
    output->resize(num + Tsize);
    T *fea_ptrs = reinterpret_cast<T *>(&(*output)[num]);
    thread_local paddle::string::str_ptr_stream ss;
    for (size_t i = 0; i < feat_str_size; i++) {
      ss.reset(*(feat_str_begin + i));
      ss >> fea_ptrs[i];
    }
  }

 protected:
  std::vector<std::string> feature;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h"
#include <iostream>
#include <memory>
#include <unordered_map>
#include "paddle/fluid/framework/generator.h"
namespace paddle {
namespace distributed {
// Stores the edge blob; uniform sampling needs no preprocessing.
void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; }
// Draws k distinct edge indices uniformly at random from [0, n) where
// n = edges->size(); if k >= n, every index is returned in order.
//
// Implemented as a "virtual" Fisher-Yates shuffle: instead of
// materializing an index array, replace_map records which index currently
// occupies each already-consumed position, so every draw costs O(1) plus
// hash lookups.
std::vector<int> RandomSampler::sample_k(
    int k, const std::shared_ptr<std::mt19937_64> rng) {
  int n = edges->size();
  if (k >= n) {
    k = n;
    std::vector<int> sample_result;
    for (int i = 0; i < k; i++) {
      sample_result.push_back(i);
    }
    return sample_result;
  }
  std::vector<int> sample_result;
  std::unordered_map<int, int> replace_map;
  while (k--) {
    // Pick a position in the still-active prefix [0, n).
    std::uniform_int_distribution<int> distrib(0, n - 1);
    int rand_int = distrib(*rng);
    // The value at that position is rand_int itself unless an earlier
    // draw relocated another index there.
    auto iter = replace_map.find(rand_int);
    if (iter == replace_map.end()) {
      sample_result.push_back(rand_int);
    } else {
      sample_result.push_back(iter->second);
    }
    // Move the value living at the last active position (n - 1) into the
    // slot just consumed, then shrink the active prefix by one.
    iter = replace_map.find(n - 1);
    if (iter == replace_map.end()) {
      replace_map[rand_int] = n - 1;
    } else {
      replace_map[rand_int] = iter->second;
    }
    --n;
  }
  return sample_result;
}
// Constructs an empty node.
//
// Fix: weight, count and idx were previously left uninitialized; sample_k
// reads `count` before build() is guaranteed to have run, so all scalar
// state is now zeroed.
WeightedSampler::WeightedSampler() {
  left = nullptr;
  right = nullptr;
  edges = nullptr;
  weight = 0;
  count = 0;
  idx = 0;
}
// Recursively releases the subtree: deleting a child invokes that child's
// own destructor, so the whole tree is torn down.
WeightedSampler::~WeightedSampler() {
  delete left;  // delete on nullptr is a no-op
  left = nullptr;
  delete right;
  right = nullptr;
}
// (Re)builds the weighted sampling tree over `edges`, discarding any
// previously built subtree first so build() can be called repeatedly.
void WeightedSampler::build(GraphEdgeBlob *edges) {
  delete left;  // safe on nullptr; drop any earlier tree
  left = nullptr;
  delete right;
  right = nullptr;
  build_one(static_cast<WeightedGraphEdgeBlob *>(edges), 0, edges->size());
}
// Recursively builds the segment tree over the edge range [start, end).
// A leaf holds a single edge's index and weight; an internal node caches
// the sum of its children's weights and edge counts.
void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges,
                                int start,
                                int end) {
  this->edges = edges;
  if (start + 1 == end) {
    // Leaf: exactly one edge.
    left = right = nullptr;
    idx = start;
    count = 1;
    weight = edges->get_weight(idx);
  } else {
    // Internal node: split the range in half and aggregate the children.
    int mid = start + (end - start) / 2;
    left = new WeightedSampler();
    right = new WeightedSampler();
    left->build_one(edges, start, mid);
    right->build_one(edges, mid, end);
    weight = left->weight + right->weight;
    count = left->count + right->count;
  }
}
// Draws k edge indices without replacement, each with probability
// proportional to its weight; if k >= count, all indices are returned.
//
// Without-replacement sampling works by remembering, per tree node, how
// much weight and how many edges have already been consumed
// (subtract_weight_map / subtract_count_map) and renormalizing every draw
// to the weight that is still available.
//
// Fix: `subtract` was previously declared uninitialized; it is written by
// sample() before being read, but zero-initializing it removes the lint
// hazard at no cost.
std::vector<int> WeightedSampler::sample_k(
    int k, const std::shared_ptr<std::mt19937_64> rng) {
  if (k >= count) {
    k = count;
    std::vector<int> sample_result;
    for (int i = 0; i < k; i++) {
      sample_result.push_back(i);
    }
    return sample_result;
  }
  std::vector<int> sample_result;
  float subtract = 0;
  std::unordered_map<WeightedSampler *, float> subtract_weight_map;
  std::unordered_map<WeightedSampler *, int> subtract_count_map;
  std::uniform_real_distribution<float> distrib(0, 1.0);
  while (k--) {
    // Scale the uniform draw to the weight still available at the root.
    float query_weight = distrib(*rng);
    query_weight *= weight - subtract_weight_map[this];
    sample_result.push_back(sample(
        query_weight, subtract_weight_map, subtract_count_map, subtract));
  }
  return sample_result;
}
// Descends the tree to the edge selected by query_weight, then marks that
// leaf as consumed by propagating its weight (via `subtract`) and a count
// of one back up through the maps as the recursion unwinds.
int WeightedSampler::sample(
    float query_weight,
    std::unordered_map<WeightedSampler *, float> &subtract_weight_map,
    std::unordered_map<WeightedSampler *, int> &subtract_count_map,
    float &subtract) {
  if (left == nullptr) {
    // Leaf: consume this edge entirely and report its weight upward.
    subtract_weight_map[this] = weight;
    subtract = weight;
    subtract_count_map[this] = 1;
    return idx;
  }
  // operator[] default-inserts 0 for nodes not seen before, which is
  // exactly the "nothing consumed yet" value.
  int left_count = left->count - subtract_count_map[left];
  int right_count = right->count - subtract_count_map[right];
  float left_subtract = subtract_weight_map[left];
  int return_idx;
  // Go left when the right subtree is exhausted, or when the left subtree
  // still has edges and its remaining weight covers the query. (&& binds
  // tighter than ||, which is the intended grouping here.)
  if (right_count == 0 ||
      left_count > 0 && left->weight - left_subtract >= query_weight) {
    return_idx = left->sample(
        query_weight, subtract_weight_map, subtract_count_map, subtract);
  } else {
    // Shift the query into the right subtree's weight range.
    return_idx = right->sample(query_weight - (left->weight - left_subtract),
                               subtract_weight_map,
                               subtract_count_map,
                               subtract);
  }
  // Record the consumed weight/count at this node on the way back up.
  subtract_weight_map[this] += subtract;
  subtract_count_map[this]++;
  return return_idx;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ctime>
#include <memory>
#include <random>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h"
namespace paddle {
namespace distributed {
// Abstract interface for drawing up to k neighbor indices from an edge
// blob.
class Sampler {
 public:
  virtual ~Sampler() {}
  // Prepares internal state for sampling from `edges`; must be called
  // before sample_k.
  virtual void build(GraphEdgeBlob *edges) = 0;
  // Returns up to k edge indices drawn according to this sampler's
  // policy, using `rng` as the randomness source.
  virtual std::vector<int> sample_k(
      int k, const std::shared_ptr<std::mt19937_64> rng) = 0;
};
// Uniform sampler: every edge is equally likely; build() only stores the
// blob pointer.
class RandomSampler : public Sampler {
 public:
  virtual ~RandomSampler() {}
  virtual void build(GraphEdgeBlob *edges);
  virtual std::vector<int> sample_k(int k,
                                    const std::shared_ptr<std::mt19937_64> rng);
  // Not owned (never deleted here); set by build(). Must outlive the
  // sampler.
  GraphEdgeBlob *edges;
};
// Weight-proportional sampler backed by a binary segment tree over the
// edges; internal nodes cache subtree weight sums and counts so a single
// draw costs O(log n).
class WeightedSampler : public Sampler {
 public:
  WeightedSampler();
  virtual ~WeightedSampler();
  WeightedSampler *left, *right;  // owned children; nullptr at leaves
  float weight;                   // total edge weight of this subtree
  int count;                      // number of edges in this subtree
  int idx;                        // edge index (meaningful only at leaves)
  GraphEdgeBlob *edges;           // not owned
  virtual void build(GraphEdgeBlob *edges);
  // Builds the subtree covering the edge range [start, end).
  virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end);
  virtual std::vector<int> sample_k(int k,
                                    const std::shared_ptr<std::mt19937_64> rng);
 private:
  // One weighted draw; the maps track weight/count already consumed per
  // node so successive draws are without replacement.
  int sample(float query_weight,
             std::unordered_map<WeightedSampler *, float> &subtract_weight_map,
             std::unordered_map<WeightedSampler *, int> &subtract_count_map,
             float &subtract);
};
} // namespace distributed
} // namespace paddle
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment