Merge pull request #1084 from kvcache-ai/fix-config

format kvc2, delete quant_configs, move model_configs to ~/.ktransfor…

Merge pull request #1084 from kvcache-ai/fix-config
format kvc2, delete quant_configs, move model_configs to ~/.ktransfor…
41ce92bb · wang jiahao · GitHub · 10fd2e28 · 64de7843 · 41ce92bb
Unverified Commit 41ce92bb authored Apr 08, 2025 by wang jiahao Committed by GitHub Apr 08, 2025
11 changed files
--- a/csrc/balance_serve/sched/utils/atomic_ptr_with_flags.hpp
+++ b/csrc/balance_serve/sched/utils/atomic_ptr_with_flags.hpp
 #include <atomic>

-template <typename T>
-struct AtomicPtrWithFlag {
+template <typename T> struct AtomicPtrWithFlag {
  constexpr static uint64_t mask = 1ull << 63;
  std::atomic_uint64_t ptr = 0;

-  std::pair<T*, bool> load(std::memory_order order = std::memory_order_seq_cst) {
+  std::pair<T *, bool>
+  load(std::memory_order order = std::memory_order_seq_cst) {
    uint64_t val = ptr.load(order);
-    return {reinterpret_cast<T*>(val & (~mask)), val & mask};
+    return {reinterpret_cast<T *>(val & (~mask)), val & mask};
  }

-  void store(T* p, bool flag, std::memory_order order = std::memory_order_seq_cst) {
+  void store(T *p, bool flag,
+             std::memory_order order = std::memory_order_seq_cst) {
    ptr.store(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
  }

-  std::pair<T*, bool> exchange(T* p, bool flag, std::memory_order order = std::memory_order_seq_cst) {
-    uint64_t val = ptr.exchange(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
-    return {reinterpret_cast<T*>(val & (~mask)), val & mask};
+  std::pair<T *, bool>
+  exchange(T *p, bool flag,
+           std::memory_order order = std::memory_order_seq_cst) {
+    uint64_t val =
+        ptr.exchange(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
+    return {reinterpret_cast<T *>(val & (~mask)), val & mask};
  }

-  std::pair<T*, bool> touch_load(std::memory_order order = std::memory_order_seq_cst) {
+  std::pair<T *, bool>
+  touch_load(std::memory_order order = std::memory_order_seq_cst) {
    uint64_t val = ptr.fetch_and(~mask, order);
-    return {reinterpret_cast<T*>(val & (~mask)), val & mask};
+    return {reinterpret_cast<T *>(val & (~mask)), val & mask};
  }

-  bool check_flag(std::memory_order order = std::memory_order_seq_cst) { return ptr.load(order) & mask; }
+  bool check_flag(std::memory_order order = std::memory_order_seq_cst) {
+    return ptr.load(order) & mask;
+  }
 };
--- a/csrc/balance_serve/sched/utils/csv.hpp
+++ b/csrc/balance_serve/sched/utils/csv.hpp
@@ -19,7 +19,7 @@ namespace csv {
 * @param line The CSV line to parse.
 * @return A vector of strings, each representing a field in the CSV line.
 */
-inline std::vector<std::string> parse_csv_line(const std::string& line) {
+inline std::vector<std::string> parse_csv_line(const std::string &line) {
  std::vector<std::string> result;
  std::string field;
  bool in_quotes = false;
@@ -57,7 +57,8 @@ inline std::vector<std::string> parse_csv_line(const std::string& line) {
 * @return A vector of pairs, each containing a column name and a vector of data
 * for that column.
 */
-inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(const std::string& filename) {
+inline std::vector<std::pair<std::string, std::vector<std::string>>>
+read_csv(const std::string &filename) {
  std::cout << "Reading CSV file: " << filename << std::endl;
  // Open the file
  std::ifstream file(filename);
@@ -72,7 +73,7 @@ inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(co

  // Prepare the result vector with column names
  std::vector<std::pair<std::string, std::vector<std::string>>> result;
-  for (const auto& name : column_names) {
+  for (const auto &name : column_names) {
    result.emplace_back(name, std::vector<std::string>());
  }

@@ -84,7 +85,7 @@ inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(co
  // Determine the number of threads to use
  unsigned int num_threads = std::thread::hardware_concurrency();
  if (num_threads == 0)
-    num_threads = 4;  // Default to 4 threads if hardware_concurrency returns 0
+    num_threads = 4; // Default to 4 threads if hardware_concurrency returns 0

  // Calculate chunk start positions based on content size
  std::vector<size_t> chunk_starts;
@@ -100,14 +101,15 @@ inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(co
      ++pos;
    }
    if (pos < content_size) {
-      ++pos;  // Skip the newline character
+      ++pos; // Skip the newline character
    }
    chunk_starts.push_back(pos);
  }
  chunk_starts.push_back(content_size);

  // Create threads to parse each chunk
-  std::vector<std::vector<std::vector<std::string>>> thread_results(num_threads);
+  std::vector<std::vector<std::vector<std::string>>> thread_results(
+      num_threads);
  std::vector<std::thread> threads;

  for (unsigned int i = 0; i < num_threads; ++i) {
@@ -133,13 +135,13 @@ inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(co
  }

  // Wait for all threads to finish
-  for (auto& t : threads) {
+  for (auto &t : threads) {
    t.join();
  }

  // Combine the results from all threads into the final result
-  for (const auto& local_result : thread_results) {
-    for (const auto& row : local_result) {
+  for (const auto &local_result : thread_results) {
+    for (const auto &row : local_result) {
      for (size_t i = 0; i < row.size(); ++i) {
        if (i < result.size()) {
          result[i].second.push_back(row[i]);
@@ -158,8 +160,9 @@ inline std::vector<std::pair<std::string, std::vector<std::string>>> read_csv(co
 * @param data A vector of pairs, each containing a column name and a vector of
 * data for that column.
 */
-inline void write_csv(const std::string& filename,
-                      const std::vector<std::pair<std::string, std::vector<std::string>>>& data) {
+inline void write_csv(
+    const std::string &filename,
+    const std::vector<std::pair<std::string, std::vector<std::string>>> &data) {
  std::cout << "Writing CSV file: " << filename << std::endl;

  // Open the file for writing
@@ -170,10 +173,10 @@ inline void write_csv(const std::string& filename,

  // Check that all columns have the same number of rows
  if (data.empty()) {
-    return;  // Nothing to write
+    return; // Nothing to write
  }
  size_t num_rows = data[0].second.size();
-  for (const auto& column : data) {
+  for (const auto &column : data) {
    if (column.second.size() != num_rows) {
      throw std::runtime_error("All columns must have the same number of rows");
    }
@@ -191,7 +194,7 @@ inline void write_csv(const std::string& filename,
  // Write the data rows
  for (size_t row = 0; row < num_rows; ++row) {
    for (size_t col = 0; col < data.size(); ++col) {
-      const std::string& field = data[col].second[row];
+      const std::string &field = data[col].second[row];
      // Handle CSV escaping
      std::string escaped_field = field;
      bool needs_quotes = false;
@@ -204,7 +207,8 @@ inline void write_csv(const std::string& filename,
          pos += 2;
        }
      }
-      if (escaped_field.find(',') != std::string::npos || escaped_field.find('\n') != std::string::npos) {
+      if (escaped_field.find(',') != std::string::npos ||
+          escaped_field.find('\n') != std::string::npos) {
        needs_quotes = true;
      }
      if (needs_quotes) {
@@ -220,6 +224,6 @@ inline void write_csv(const std::string& filename,
  }
 }

-}  // namespace csv
+} // namespace csv

-#endif  // CSV_READER_HPP
+#endif // CSV_READER_HPP
--- a/csrc/balance_serve/sched/utils/easy_format.hpp
+++ b/csrc/balance_serve/sched/utils/easy_format.hpp
@@ -2,15 +2,14 @@
 #include <string>
 #include <vector>

-template <typename T>
-std::string format_vector(const std::vector<T>& v) {
+template <typename T> std::string format_vector(const std::vector<T> &v) {
  std::ostringstream oss;
  if (v.empty())
    return "[]";
  for (size_t i = 0; i < v.size(); ++i) {
    oss << v[i];
    if (i < v.size() - 1)
-      oss << ", ";  // 逗号分隔
+      oss << ", "; // 逗号分隔
  }
  return oss.str();
 }
--- a/csrc/balance_serve/sched/utils/mpsc.hpp
+++ b/csrc/balance_serve/sched/utils/mpsc.hpp
@@ -4,32 +4,31 @@
 #include <optional>
 #include <semaphore>

-template <typename T>
-class MPSCQueue {
+template <typename T> class MPSCQueue {
  struct Node {
    T data;
-    std::atomic<Node*> next;
+    std::atomic<Node *> next;

    Node() : next(nullptr) {}
    Node(T data_) : data(std::move(data_)), next(nullptr) {}
  };

-  std::atomic<Node*> head;
-  Node* tail;
+  std::atomic<Node *> head;
+  Node *tail;

- public:
+public:
  std::atomic_size_t enqueue_count = 0;
  size_t dequeue_count = 0;
  MPSCQueue() {
-    Node* dummy = new Node();
+    Node *dummy = new Node();
    head.store(dummy, std::memory_order_seq_cst);
    tail = dummy;
  }

  ~MPSCQueue() {
-    Node* node = tail;
+    Node *node = tail;
    while (node) {
-      Node* next = node->next.load(std::memory_order_seq_cst);
+      Node *next = node->next.load(std::memory_order_seq_cst);
      delete node;
      node = next;
    }
@@ -38,14 +37,14 @@ class MPSCQueue {
  // 生产者调用
  void enqueue(T data) {
    enqueue_count.fetch_add(1);
-    Node* node = new Node(std::move(data));
-    Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
+    Node *node = new Node(std::move(data));
+    Node *prev_head = head.exchange(node, std::memory_order_seq_cst);
    prev_head->next.store(node, std::memory_order_seq_cst);
  }

  // 消费者调用
  std::optional<T> dequeue() {
-    Node* next = tail->next.load(std::memory_order_seq_cst);
+    Node *next = tail->next.load(std::memory_order_seq_cst);
    if (next) {
      T res = std::move(next->data);
      delete tail;
@@ -59,16 +58,16 @@ class MPSCQueue {
  size_t size() { return enqueue_count.load() - dequeue_count; }
 };

-template <typename T>
-class MPSCQueueConsumerLock {
+template <typename T> class MPSCQueueConsumerLock {
  MPSCQueue<T> queue;
  std::counting_semaphore<> sema{0};

- public:
+public:
  void enqueue(T data) {
    queue.enqueue(std::move(data));
-    // std::atomic_thread_fence(std::memory_order_seq_cst);// Inserting this because the memory order might be wrong, I
-    // am also not that sure about this.
+    // std::atomic_thread_fence(std::memory_order_seq_cst);// Inserting this
+    // because the memory order might be wrong, I am also not that sure about
+    // this.
    sema.release();
  }

@@ -76,8 +75,10 @@ class MPSCQueueConsumerLock {
    auto re = queue.dequeue();
    if (re.has_value()) {
      while (sema.try_acquire() == false) {
-        std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try acquire should be success, retrying, please check"
-                  << std::endl;
+        std::cerr
+            << __FILE__ << ":" << __FUNCTION__
+            << " sema try acquire should be success, retrying, please check"
+            << std::endl;
        // assert(false);
      }
      return re.value();
@@ -91,8 +92,10 @@ class MPSCQueueConsumerLock {
    auto re = queue.dequeue();
    if (re.has_value()) {
      while (sema.try_acquire() == false) {
-        std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try acquire should be success, retrying, please check"
-                  << std::endl;
+        std::cerr
+            << __FILE__ << ":" << __FUNCTION__
+            << " sema try acquire should be success, retrying, please check"
+            << std::endl;
        // assert(false);
      }
      return re.value();

--- a/csrc/balance_serve/sched/utils/statistics.hpp
+++ b/csrc/balance_serve/sched/utils/statistics.hpp
@@ -7,59 +7,71 @@
 #include <unordered_map>

 class Statistics {
- public:
+public:
  // Increment the counter for a given key by a specified value (default is 1)
-  void increment_counter(const std::string& key, int64_t value = 1) { counters_[key] += value; }
+  void increment_counter(const std::string &key, int64_t value = 1) {
+    counters_[key] += value;
+  }

-  int64_t& get_counter(const std::string& key) { return counters_[key]; }
+  int64_t &get_counter(const std::string &key) { return counters_[key]; }

  // Start the timer for a given key
-  void start_timer(const std::string& key) { active_timers_[key] = std::chrono::high_resolution_clock::now(); }
+  void start_timer(const std::string &key) {
+    active_timers_[key] = std::chrono::high_resolution_clock::now();
+  }

  // Stop the timer for a given key and update the total time and count
-  void stop_timer(const std::string& key) {
+  void stop_timer(const std::string &key) {
    auto start_it = active_timers_.find(key);
    if (start_it != active_timers_.end()) {
-      auto duration = std::chrono::high_resolution_clock::now() - start_it->second;
+      auto duration =
+          std::chrono::high_resolution_clock::now() - start_it->second;
      timings_[key].total_time += duration;
      timings_[key].count += 1;
      active_timers_.erase(start_it);
    } else {
      // Handle error: stop_timer called without a matching start_timer
-      std::cerr << "Warning: stop_timer called for key '" << key << "' without a matching start_timer.\n";
+      std::cerr << "Warning: stop_timer called for key '" << key
+                << "' without a matching start_timer.\n";
    }
  }

  // Print out the collected statistical information
  void report() const {
    std::cout << "Counters:\n";
-    for (const auto& kv : counters_) {
+    for (const auto &kv : counters_) {
      std::cout << "  " << kv.first << ": " << kv.second << "\n";
    }
    std::cout << "\nTimers:\n";
-    for (const auto& kv : timings_) {
+    for (const auto &kv : timings_) {
      std::cout << "  " << kv.first << ": count = " << kv.second.count
                << ", total_time = " << kv.second.total_time.count() << "s"
-                << ", average_time = " << (kv.second.count > 0 ? kv.second.total_time.count() / kv.second.count : 0)
+                << ", average_time = "
+                << (kv.second.count > 0
+                        ? kv.second.total_time.count() / kv.second.count
+                        : 0)
                << "s\n";
    }
  }

- private:
+private:
  // Mapping from key to counter
  std::unordered_map<std::string, int64_t> counters_;

  // Struct to hold timing information for a key
  struct TimingInfo {
    int64_t count = 0;
-    std::chrono::duration<double> total_time = std::chrono::duration<double>::zero();
+    std::chrono::duration<double> total_time =
+        std::chrono::duration<double>::zero();
  };

  // Mapping from key to timing information
  std::unordered_map<std::string, TimingInfo> timings_;

  // Mapping from key to the start time of active timers
-  std::unordered_map<std::string, std::chrono::high_resolution_clock::time_point> active_timers_;
+  std::unordered_map<std::string,
+                     std::chrono::high_resolution_clock::time_point>
+      active_timers_;
 };

-#endif  // STATISTICS_HPP
+#endif // STATISTICS_HPP
--- a/csrc/balance_serve/sched/utils/timer.hpp
+++ b/csrc/balance_serve/sched/utils/timer.hpp
 #pragma once
+#include "readable_number.hpp"
 #include <cassert>
 #include <chrono>
 #include <iomanip>
@@ -6,7 +7,6 @@
 #include <map>
 #include <sstream>
 #include <string>
-#include "readable_number.hpp"

 inline std::string doubleToStringR2(double value) {
  std::stringstream stream;
@@ -15,7 +15,7 @@ inline std::string doubleToStringR2(double value) {
 }

 class Timer {
- public:
+public:
  std::string name;
  bool tmp_timer = false;

@@ -49,10 +49,14 @@ class Timer {
      endTime = m_endTime;
    }

-    return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - m_startTime).count();
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime -
+                                                                m_startTime)
+        .count();
  }

-  void printElapsedMilliseconds() { std::cout << elapsedNs() / 1e6 << " ms" << std::endl; }
+  void printElapsedMilliseconds() {
+    std::cout << elapsedNs() / 1e6 << " ms" << std::endl;
+  }

  static std::string ns_to_string(double duration) {
    auto nano_sec = duration;
@@ -100,13 +104,13 @@ class Timer {
    return readable_number(ops) + "op/s";
  }

-  void merge(Timer& other) {
+  void merge(Timer &other) {
    assert(m_isRunning == false);
    assert(other.m_isRunning == false);
    m_runningNs += other.runningTimeNs();
  }

- private:
+private:
  std::chrono::time_point<std::chrono::high_resolution_clock> m_startTime;
  std::chrono::time_point<std::chrono::high_resolution_clock> m_endTime;
  bool m_isRunning = false;
@@ -114,14 +118,14 @@ class Timer {
 };

 class Counter {
- public:
+public:
  Counter() {}

  std::map<std::string, size_t> counters;

-  void inc(const char* name, size_t num) { counters[name] += num; };
+  void inc(const char *name, size_t num) { counters[name] += num; };
  void print() {
-    for (auto& p : counters) {
+    for (auto &p : counters) {
      std::cout << p.first << " : " << p.second << std::endl;
    }
  };

--- a/ktransformers/configs/model_configs.json
+++ b/ktransformers/configs/model_configs.json
-{
-    "DeepSeek-Coder-V2-Instruct": {
-        "hidden_size": 5120,
-        "intermediate_size": 12288,
-        "max_position_embeddings": 163840,
-        "model_type": "deepseek_v2",
-        "num_attention_heads": 128,
-        "num_hidden_layers": 60,
-        "num_key_value_heads": 128,
-        "vocab_size": 102400
-    },
-    "DeepSeek-R1": {
-        "hidden_size": 7168,
-        "intermediate_size": 18432,
-        "max_position_embeddings": 163840,
-        "model_type": "deepseek_v3",
-        "num_attention_heads": 128,
-        "num_hidden_layers": 61,
-        "num_key_value_heads": 128,
-        "vocab_size": 129280
-    },
-    "DeepSeek-V2-Lite-Chat": {
-        "hidden_size": 2048,
-        "intermediate_size": 10944,
-        "max_position_embeddings": 163840,
-        "model_type": "deepseek_v2",
-        "num_attention_heads": 16,
-        "num_hidden_layers": 27,
-        "num_key_value_heads": 16,
-        "vocab_size": 102400
-    },
-    "DeepSeek-V3": {
-        "hidden_size": 7168,
-        "intermediate_size": 18432,
-        "max_position_embeddings": 163840,
-        "model_type": "deepseek_v3",
-        "num_attention_heads": 128,
-        "num_hidden_layers": 3,
-        "num_key_value_heads": 128,
-        "vocab_size": 129280
-    },
-    "DeepSeek-V3-bf16": {
-        "hidden_size": 7168,
-        "intermediate_size": 18432,
-        "max_position_embeddings": 163840,
-        "model_type": "deepseek_v3",
-        "num_attention_heads": 128,
-        "num_hidden_layers": 61,
-        "num_key_value_heads": 128,
-        "vocab_size": 129280
-    },
-    "LLaMA-2-7B-32K": {
-        "hidden_size": 4096,
-        "intermediate_size": 11008,
-        "max_position_embeddings": 32768,
-        "model_type": "llama",
-        "num_attention_heads": 32,
-        "num_hidden_layers": 32,
-        "num_key_value_heads": 32,
-        "vocab_size": 32000
-    },
-    "Moonlight-16B-A3B-Instruct": {
-        "hidden_size": 2048,
-        "intermediate_size": 11264,
-        "max_position_embeddings": 8192,
-        "model_type": "deepseek_v3",
-        "num_attention_heads": 16,
-        "num_hidden_layers": 27,
-        "num_key_value_heads": 16,
-        "vocab_size": 163840
-    },
-    "Qwen2.5-32B-Instruct": {
-        "hidden_size": 5120,
-        "intermediate_size": 27648,
-        "max_position_embeddings": 32768,
-        "model_type": "qwen2",
-        "num_attention_heads": 40,
-        "num_hidden_layers": 64,
-        "num_key_value_heads": 8,
-        "vocab_size": 152064
-    },
-    "Qwen2.5-32B-Instruct-GPTQ-Int4": {
-        "hidden_size": 5120,
-        "intermediate_size": 27648,
-        "max_position_embeddings": 32768,
-        "model_type": "qwen2",
-        "num_attention_heads": 40,
-        "num_hidden_layers": 64,
-        "num_key_value_heads": 8,
-        "vocab_size": 152064
-    },
-    "Qwen2.5-7B-Instruct": {
-        "hidden_size": 3584,
-        "intermediate_size": 18944,
-        "max_position_embeddings": 32768,
-        "model_type": "qwen2",
-        "num_attention_heads": 28,
-        "num_hidden_layers": 28,
-        "num_key_value_heads": 4,
-        "vocab_size": 152064
-    },
-    "Qwen2.5-7B-Instruct-GPTQ-Int4": {
-        "hidden_size": 3584,
-        "intermediate_size": 18944,
-        "max_position_embeddings": 32768,
-        "model_type": "qwen2",
-        "num_attention_heads": 28,
-        "num_hidden_layers": 28,
-        "num_key_value_heads": 4,
-        "vocab_size": 152064
-    },
-    "qwen2-72b-instruct": {
-        "hidden_size": 8192,
-        "intermediate_size": 29568,
-        "max_position_embeddings": 32768,
-        "model_type": "qwen2",
-        "num_attention_heads": 64,
-        "num_hidden_layers": 80,
-        "num_key_value_heads": 8,
-        "vocab_size": 152064
-    }
-}
\ No newline at end of file
--- a/ktransformers/configs/quant_configs.json
+++ b/ktransformers/configs/quant_configs.json
-{
-    "BF16": {
-        "block_element_count": 1,
-        "block_element_size": 2,
-        "bytes_per_element": 2.0,
-        "can_be_used_as_vector": true,
-        "has_min": false,
-        "has_scale": false,
-        "name": "BF16",
-        "reference": "",
-        "type_of_dot_vector": "BF16"
-    },
-    "FP16": {
-        "block_element_count": 1,
-        "block_element_size": 2,
-        "bytes_per_element": 2.0,
-        "can_be_used_as_vector": true,
-        "has_min": false,
-        "has_scale": false,
-        "name": "FP16",
-        "reference": "",
-        "type_of_dot_vector": "FP16"
-    },
-    "FP32": {
-        "block_element_count": 1,
-        "block_element_size": 4,
-        "bytes_per_element": 4.0,
-        "can_be_used_as_vector": true,
-        "has_min": false,
-        "has_scale": false,
-        "name": "FP32",
-        "reference": "",
-        "type_of_dot_vector": "FP32"
-    },
-    "Q4_0": {
-        "block_element_count": 32,
-        "block_element_size": 18,
-        "bytes_per_element": 0.5625,
-        "can_be_used_as_vector": false,
-        "has_min": false,
-        "has_scale": true,
-        "name": "Q4_0",
-        "reference": "https://huggingface.co/docs/hub/gguf",
-        "type_of_dot_vector": "Q8_0"
-    },
-    "Q8_0": {
-        "block_element_count": 32,
-        "block_element_size": 34,
-        "bytes_per_element": 1.0625,
-        "can_be_used_as_vector": true,
-        "has_min": false,
-        "has_scale": true,
-        "name": "Q8_0",
-        "reference": "https://huggingface.co/docs/hub/gguf",
-        "type_of_dot_vector": "Q8_0"
-    }
-}
\ No newline at end of file
--- a/ktransformers/server/args.py
+++ b/ktransformers/server/args.py
@@ -70,6 +70,9 @@ class ArgumentParser:
        parser.add_argument("--batch_size", type=int, default=self.cfg.batch_size)
        parser.add_argument("--cache_lens", type=int, default=self.cfg.cache_lens)

+        # kvc2 config
+        parser.add_argument("--kvc2_config_dir", type=str, default=self.cfg.kvc2_config_dir)
+
        # log configs
        # log level: debug, info, warn, error, crit
        parser.add_argument("--log_dir", type=str, default=self.cfg.log_dir)

--- a/ktransformers/server/balance_serve/settings.py
+++ b/ktransformers/server/balance_serve/settings.py
@@ -7,9 +7,7 @@ import sys, os
 import yaml, json
 from time import sleep

-current_dir = os.path.dirname(__file__)
-# sched_path = os.path.abspath(os.path.join(current_dir, '../../../build/balance_serve/sched'))
-# sys.path.insert(0, sched_path)
+
 import sched_ext
 from transformers import AutoConfig

@@ -52,8 +50,7 @@ def create_sched_settings(args):
    settings.v_cache_on = False

    settings.kvc2_root_path = '/mnt/data/persist-kvc'
-    settings.kvc2_config_path = os.path.join(current_dir, "..", "..", "configs")
-    print(os.path.join(current_dir, "..", "..", "configs"))
+    settings.kvc2_config_path = args.kvc2_config_dir
    settings.memory_pool_size_GB = args.cpu_memory_size_GB
    settings.evict_count = 40
    settings.kvc2_metrics_port = args.kvc2_metrics_port

--- a/ktransformers/server/config/config.py
+++ b/ktransformers/server/config/config.py
@@ -34,12 +34,15 @@ class Config(metaclass=Singleton):

        user_path: str = os.path.expanduser("~")
        localstore_path: str = os.path.join(user_path, ".ktransformers")
+        kvc2_config_dir = os.path.join(localstore_path, "kvc2")
        config_path: str = os.path.join(localstore_path, Config.CONFIG_FILE_NAME)
        if not os.path.exists(config_yaml):
            print(f"Can't find config file, {config_yaml}")
            exit(-1)
        if not os.path.exists(localstore_path):
            os.mkdir(localstore_path)
+        if not os.path.exists(kvc2_config_dir):
+            os.mkdir(kvc2_config_dir)
        if not os.path.exists(config_path):
            shutil.copyfile(config_yaml, config_path)
        with open(config_path, "r", encoding="utf-8") as fp:
@@ -62,10 +65,13 @@ class Config(metaclass=Singleton):
        self.localstore_path: str = os.path.join(self.user_path, ".ktransformers")
        # log configs
        self.log_dir = os.path.join(self.localstore_path, cfg["log"]["dir"])
+        if not os.path.exists(self.log_dir):
+            os.mkdir(self.log_dir)
        self.log_file = cfg["log"]["file"]
        self.log_level = cfg["log"]["level"]
        self.backup_count = cfg["log"]["backup_count"]

+        self.kvc2_config_dir = os.path.join(self.localstore_path, "kvc2")
        # server configs
        self.server: dict = cfg.get("server", {})
        self.server_ip = self.server.get("ip", "0.0.0.0")