// infer_engine.hpp
#pragma once

#include "../models/llama/llama_config.hpp"
#include "distributed/distributed.hpp"
#include "infinicore/tensor.hpp"
#include "rank_worker.hpp"

#include <any>
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

namespace infinilm::engine {

class InferEngine {
public:
15
16
17
18
19
20
21
22
23
24
    struct Input {
        infinicore::Tensor input_ids;

        infinicore::Tensor position_ids;
    };

    struct Output {
        infinicore::Tensor logits;
    };

25
    // Updated constructor: accept CacheConfig instead of CacheType
26
    InferEngine(
Jiacheng Huang's avatar
Jiacheng Huang committed
27
        const InfinilmModel::Config &config,
28
        const distributed::DistConfig &distributed_config = distributed::DistConfig(),
29
30
        infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
        const cache::CacheConfig &cache_config = cache::CacheConfig());
31
32
33
34

    // Load a parameter to all workers (each can extract its shard inside RankWorker)
    void load_param(const std::string &name, const infinicore::Tensor &param);

35
    // return the parameters (i.e. weights and biases).
36
    std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> state_dict();
37

38
    // Run a single forward pass on all workers and return the outputs from all ranks
39
    Output forward(const Input &input);
40

41
42
43
44
45
    // Reset the internal cache pos in all workers (clears state between generations)
    void reset_cache(size_t pos = 0);

    // Overload: reset cache with new KV configuration
    void reset_cache(const cache::CacheConfig &new_config, size_t pos = 0);
Ceng's avatar
Ceng committed
46

47
48
49
50
    ~InferEngine();

    const distributed::DistConfig &get_dist_config() const;

51
52
53
    // Get current KV configuration
    const cache::CacheConfig &get_cache_config() const { return cache_config_; }

54
55
56
protected:
    std::vector<std::unique_ptr<RankWorker>> workers_;
    distributed::CommunicationGroup communication_group_;
Jiacheng Huang's avatar
Jiacheng Huang committed
57
    const InfinilmModel::Config &model_config_;
58
    cache::CacheConfig cache_config_;
59
60
61
};

} // namespace infinilm::engine