Commit 5f329d7a authored by PanZezhong's avatar PanZezhong
Browse files

issue/1031 T1-1-15

parent b2660e66
......@@ -16,6 +16,11 @@ hash_combine(size_t &seed, const T &value) {
// Specialization for Tensor
inline void hash_combine(size_t &seed, Tensor tensor) {
if (!tensor) {
hash_combine(seed, static_cast<size_t>(0));
return;
}
hash_combine(seed, static_cast<size_t>(tensor->dtype()));
for (Size shape : tensor->shape()) {
hash_combine(seed, shape);
......
......@@ -2,9 +2,13 @@
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/addcmul.hpp"
#include "ops/atanh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/binary_cross_entropy_with_logits.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cdist.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/flash_attention.hpp"
......@@ -18,6 +22,7 @@
#include "ops/paged_caching.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/reciprocal.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
......
#pragma once
#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Fused multiply-add operator: out = input + value * t1 * t2
// (semantics follow the C API header: `input` is the addend, `t1`/`t2`
// are the multiplicands, `value` scales their product).
class Addcmul {
public:
    // schema: out, input, t1, t2, value
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float);
    // Runs the operator through the implementation registered for the current device.
    static void execute(Tensor out, Tensor input, Tensor t1, Tensor t2, float value);
    // Dispatcher used to register/look up backend implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place: returns a new tensor holding the result.
Tensor addcmul(Tensor input, Tensor t1, Tensor t2, float value);
// Explicit-output variant: writes the result into `out`.
void addcmul_(Tensor out, Tensor input, Tensor t1, Tensor t2, float value);

} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Element-wise inverse hyperbolic tangent operator.
class Atanh {
public:
    // schema: void(output Tensor, input Tensor)
    using schema = void (*)(Tensor, Tensor);
    // Executes the operator: writes atanh(a) into y.
    static void execute(Tensor y, Tensor a);
    // Returns the operator dispatcher, used to match backend
    // (CPU/CUDA, etc.) implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

/**
 * @brief Computes the inverse hyperbolic tangent of the input tensor (out-of-place).
 * @param a Input tensor.
 * @return A new tensor containing the result.
 */
Tensor atanh(Tensor a);

/**
 * @brief Computes the inverse hyperbolic tangent of the input tensor (in-place / specified output).
 * @param y Output tensor.
 * @param a Input tensor.
 */
void atanh_(Tensor y, Tensor a);

} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
#include <string>

namespace infinicore::op {

// Binary cross entropy with logits loss operator.
class BinaryCrossEntropyWithLogits {
public:
    /**
     * @brief Function prototype for the BCEWithLogits operator.
     * Argument order: out, logits, target, weight, pos_weight, reduction.
     */
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, std::string);
    static void execute(Tensor out,
                        Tensor logits,
                        Tensor target,
                        Tensor weight,
                        Tensor pos_weight,
                        std::string reduction);
    static common::OpDispatcher<schema> &dispatcher();
};

/**
 * @brief Out-of-place interface; optional weights default to
 * default-constructed (empty) tensors.
 */
Tensor binary_cross_entropy_with_logits(Tensor logits,
                                        Tensor target,
                                        Tensor weight = {},
                                        Tensor pos_weight = {},
                                        std::string reduction = "mean");

/**
 * @brief Interface with an explicitly provided output tensor.
 */
void binary_cross_entropy_with_logits_(Tensor out,
                                       Tensor logits,
                                       Tensor target,
                                       Tensor weight,
                                       Tensor pos_weight,
                                       std::string reduction);

} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Pairwise p-norm distance operator.
class Cdist {
public:
    /**
     * @brief Pairwise distance computation.
     * schema: out (M, N), x1 (M, D), x2 (N, D), p (norm degree)
     */
    using schema = void (*)(Tensor, Tensor, Tensor, double);
    static void execute(Tensor out, Tensor x1, Tensor x2, double p);
    static common::OpDispatcher<schema> &dispatcher();
};

/**
 * @brief Out-of-place interface.
 * @return A new tensor of shape (M, N).
 */
Tensor cdist(Tensor x1, Tensor x2, double p = 2.0);

/**
 * @brief Interface with an explicitly provided output tensor.
 */
void cdist_(Tensor out, Tensor x1, Tensor x2, double p = 2.0);

} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Element-wise reciprocal operator.
class Reciprocal {
public:
    // schema: void(output Tensor, input Tensor)
    using schema = void (*)(Tensor, Tensor);
    // Executes the operator: writes the element-wise reciprocal of x into y.
    static void execute(Tensor y, Tensor x);
    // Dispatcher used to register/look up backend implementations.
    static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place: returns a new tensor containing the reciprocal of x.
Tensor reciprocal(Tensor x);
// Explicit-output variant: writes the reciprocal of x into y.
void reciprocal_(Tensor y, Tensor x);

} // namespace infinicore::op
......@@ -4,16 +4,25 @@
#include "infiniop/handle.h"
#include "infiniop/ops/add.h"
#include "infiniop/ops/add_rms_norm.h"
#include "infiniop/ops/addcmul.h"
#include "infiniop/ops/all.h"
#include "infiniop/ops/atanh.h"
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool1d.h"
#include "infiniop/ops/binary_cross_entropy_with_logits.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/cdist.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/cross_entropy.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/embedding.h"
#include "infiniop/ops/equal.h"
#include "infiniop/ops/flash_attention.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/hardswish.h"
#include "infiniop/ops/hardtanh.h"
#include "infiniop/ops/int8_gemm.h"
#include "infiniop/ops/kv_caching.h"
#include "infiniop/ops/layer_norm.h"
......@@ -27,6 +36,7 @@
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/reciprocal.h"
#include "infiniop/ops/relu.h"
#include "infiniop/ops/rms_norm.h"
#include "infiniop/ops/rope.h"
......@@ -47,10 +57,4 @@
#include "infiniop/ops/zeros.h"
#include "infiniop/tensor_descriptor.h"
#include "infiniop/ops/cross_entropy.h"
#include "infiniop/ops/hardswish.h"
#include "infiniop/ops/avg_pool1d.h"
#include "infiniop/ops/equal.h"
#include "infiniop/ops/hardtanh.h"
#endif // __INFINIOP_API_H__
#ifndef __INFINIOP_ADDCMUL_API_H__
#define __INFINIOP_ADDCMUL_API_H__

#include "../operator_descriptor.h"

// Descriptor type for the addcmul operator.
typedef struct InfiniopDescriptor *infiniopAddcmulDescriptor_t;

/**
 * @brief Creates an Addcmul operator descriptor.
 * @param handle Operator handle.
 * @param desc_ptr Receives the created descriptor.
 * @param out Output tensor descriptor.
 * @param input Addend tensor descriptor.
 * @param tensor1 First multiplicand tensor descriptor.
 * @param tensor2 Second multiplicand tensor descriptor.
 * @param value Scalar coefficient applied to the product.
 */
__INFINI_C __export infiniStatus_t infiniopCreateAddcmulDescriptor(infiniopHandle_t handle,
                                                                   infiniopAddcmulDescriptor_t *desc_ptr,
                                                                   infiniopTensorDescriptor_t out,
                                                                   infiniopTensorDescriptor_t input,
                                                                   infiniopTensorDescriptor_t tensor1,
                                                                   infiniopTensorDescriptor_t tensor2,
                                                                   float value);

/**
 * @brief Queries the workspace size required by the Addcmul computation.
 */
__INFINI_C __export infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size);

/**
 * @brief Runs the Addcmul computation.
 * @param desc Operator descriptor.
 * @param workspace Scratch buffer pointer.
 * @param workspace_size Scratch buffer size in bytes.
 * @param out Output data pointer.
 * @param input Addend data pointer.
 * @param tensor1 First multiplicand data pointer.
 * @param tensor2 Second multiplicand data pointer.
 * @param stream Compute stream (e.g. a CUDA stream).
 */
__INFINI_C __export infiniStatus_t infiniopAddcmul(infiniopAddcmulDescriptor_t desc,
                                                   void *workspace,
                                                   size_t workspace_size,
                                                   void *out,
                                                   const void *input,
                                                   const void *tensor1,
                                                   const void *tensor2,
                                                   void *stream);

/**
 * @brief Destroys an Addcmul operator descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc);

#endif
/* Fix: include-guard macro was mixed-case (__INFINIOP_Atanh_API_H__),
 * inconsistent with every sibling API header (ADDCMUL, CDIST, RECIPROCAL,
 * ...); normalized to all-caps. */
#ifndef __INFINIOP_ATANH_API_H__
#define __INFINIOP_ATANH_API_H__

#include "../operator_descriptor.h"

// Descriptor type for the atanh operator.
typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t;

/**
 * @brief Creates an Atanh operator descriptor.
 * @param handle Operator handle.
 * @param desc_ptr Receives the created descriptor.
 * @param y Output tensor descriptor.
 * @param a Input tensor descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle,
                                                                 infiniopAtanhDescriptor_t *desc_ptr,
                                                                 infiniopTensorDescriptor_t y,
                                                                 infiniopTensorDescriptor_t a);

/**
 * @brief Queries the workspace size required by the Atanh computation.
 */
__INFINI_C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size);

/**
 * @brief Runs the Atanh computation.
 * @param desc Operator descriptor.
 * @param workspace Scratch buffer pointer.
 * @param workspace_size Scratch buffer size in bytes.
 * @param y Output data pointer.
 * @param a Input data pointer.
 * @param stream Compute stream (e.g. a CUDA stream).
 */
__INFINI_C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc,
                                                 void *workspace,
                                                 size_t workspace_size,
                                                 void *y,
                                                 const void *a,
                                                 void *stream);

/**
 * @brief Destroys an Atanh operator descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc);

#endif
#ifndef __INFINIOP_BINARY_CROSS_ENTROPY_WITH_LOGITS_API_H__
#define __INFINIOP_BINARY_CROSS_ENTROPY_WITH_LOGITS_API_H__

#include "../operator_descriptor.h"

// Reduction modes applied to the loss output.
typedef enum {
    INFINIOP_REDUCTION_NONE = 0,
    INFINIOP_REDUCTION_MEAN = 1,
    INFINIOP_REDUCTION_SUM = 2
} infiniopReduction_t;

// Descriptor type for the BCEWithLogits operator.
typedef struct InfiniopDescriptor *infiniopBCEWithLogitsDescriptor_t;

/**
 * @brief Creates a BCEWithLogits operator descriptor.
 * @param handle Operator handle.
 * @param desc_ptr Receives the created descriptor.
 * @param out Output tensor descriptor (same shape as the input for "none";
 *            a scalar for "mean"/"sum").
 * @param logits Input logits tensor descriptor.
 * @param target Target label tensor descriptor.
 * @param weight Per-sample weight descriptor (optional; pass NULL to omit).
 * @param pos_weight Positive-sample weight descriptor (optional; pass NULL to omit).
 * @param reduction Reduction mode (none, mean, sum).
 */
__INFINI_C __export infiniStatus_t infiniopCreateBCEWithLogitsDescriptor(
    infiniopHandle_t handle,
    infiniopBCEWithLogitsDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out,
    infiniopTensorDescriptor_t logits,
    infiniopTensorDescriptor_t target,
    infiniopTensorDescriptor_t weight,
    infiniopTensorDescriptor_t pos_weight,
    infiniopReduction_t reduction);

/**
 * @brief Queries the workspace size required by the BCEWithLogits computation.
 */
__INFINI_C __export infiniStatus_t infiniopGetBCEWithLogitsWorkspaceSize(
    infiniopBCEWithLogitsDescriptor_t desc,
    size_t *size);

/**
 * @brief Runs the BCEWithLogits computation.
 * @param desc Operator descriptor.
 * @param workspace Scratch buffer pointer.
 * @param workspace_size Scratch buffer size in bytes.
 * @param out Output data pointer.
 * @param logits Logits data pointer.
 * @param target Target data pointer.
 * @param weight Weight data pointer (optional; NULL means all weights are 1).
 * @param pos_weight Positive-sample weight data pointer (optional; NULL means all weights are 1).
 * @param stream Compute stream.
 */
__INFINI_C __export infiniStatus_t infiniopBCEWithLogits(
    infiniopBCEWithLogitsDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *logits,
    const void *target,
    const void *weight,
    const void *pos_weight,
    void *stream);

/**
 * @brief Destroys a BCEWithLogits operator descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyBCEWithLogitsDescriptor(
    infiniopBCEWithLogitsDescriptor_t desc);

#endif
#ifndef __INFINIOP_CDIST_API_H__
#define __INFINIOP_CDIST_API_H__

#include "../operator_descriptor.h"

// Descriptor type for the cdist operator.
typedef struct InfiniopDescriptor *infiniopCdistDescriptor_t;

/**
 * @brief Creates a Cdist operator descriptor.
 * @param handle Operator handle.
 * @param desc_ptr Receives the created descriptor.
 * @param y_desc Output tensor descriptor (shape: M x N).
 * @param x1_desc First input tensor descriptor (shape: M x D).
 * @param x2_desc Second input tensor descriptor (shape: N x D).
 * @param p Norm degree (L-p norm).
 */
__INFINI_C __export infiniStatus_t infiniopCreateCdistDescriptor(
    infiniopHandle_t handle,
    infiniopCdistDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x1_desc,
    infiniopTensorDescriptor_t x2_desc,
    double p);

/**
 * @brief Queries the workspace size required by the Cdist computation.
 */
__INFINI_C __export infiniStatus_t infiniopGetCdistWorkspaceSize(infiniopCdistDescriptor_t desc,
                                                                 size_t *size);

/**
 * @brief Runs the Cdist computation.
 * @param desc Operator descriptor.
 * @param workspace Scratch buffer pointer.
 * @param workspace_size Scratch buffer size in bytes.
 * @param y Output data pointer.
 * @param x1 First input data pointer.
 * @param x2 Second input data pointer.
 * @param stream Compute stream (e.g. a CUDA stream).
 */
__INFINI_C __export infiniStatus_t infiniopCdist(
    infiniopCdistDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x1,
    const void *x2,
    void *stream);

/**
 * @brief Destroys a Cdist operator descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyCdistDescriptor(infiniopCdistDescriptor_t desc);

#endif
#ifndef __INFINIOP_RECIPROCAL_API_H__
#define __INFINIOP_RECIPROCAL_API_H__

#include "../operator_descriptor.h"

// Descriptor type for the reciprocal operator.
typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t;

/**
 * @brief Creates a Reciprocal operator descriptor.
 * @param handle Operator handle.
 * @param desc_ptr Receives the created descriptor.
 * @param y Output tensor descriptor.
 * @param x Input tensor descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle,
                                                                      infiniopReciprocalDescriptor_t *desc_ptr,
                                                                      infiniopTensorDescriptor_t y,
                                                                      infiniopTensorDescriptor_t x);

/**
 * @brief Queries the workspace size required by the Reciprocal computation.
 */
__INFINI_C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size);

/**
 * @brief Runs the Reciprocal computation.
 * @param desc Operator descriptor.
 * @param workspace Scratch buffer pointer.
 * @param workspace_size Scratch buffer size in bytes.
 * @param y Output data pointer.
 * @param x Input data pointer.
 * @param stream Compute stream (e.g. a CUDA stream).
 */
__INFINI_C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc,
                                                      void *workspace,
                                                      size_t workspace_size,
                                                      void *y,
                                                      const void *x,
                                                      void *stream);

/**
 * @brief Destroys a Reciprocal operator descriptor.
 */
__INFINI_C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc);

#endif
......@@ -49,8 +49,14 @@ from infinicore.dtype import (
)
from infinicore.ops.add import add
from infinicore.ops.add_rms_norm import add_rms_norm
from infinicore.ops.addcmul import addcmul
from infinicore.ops.all import all
from infinicore.ops.atanh import atanh
from infinicore.ops.attention import attention
from infinicore.ops.binary_cross_entropy_with_logits import (
binary_cross_entropy_with_logits,
)
from infinicore.ops.cdist import cdist
from infinicore.ops.cross_entropy import cross_entropy
from infinicore.ops.equal import equal
from infinicore.ops.kv_caching import kv_caching
......@@ -62,6 +68,7 @@ from infinicore.ops.paged_attention import paged_attention
from infinicore.ops.paged_attention_prefill import paged_attention_prefill
from infinicore.ops.paged_caching import paged_caching
from infinicore.ops.rearrange import rearrange
from infinicore.ops.reciprocal import reciprocal
from infinicore.ops.squeeze import squeeze
from infinicore.ops.sum import sum
from infinicore.ops.topk import topk
......@@ -125,6 +132,11 @@ __all__ = [
"short",
"uint8",
# Operations.
"addcmul",
"atanh",
"binary_cross_entropy_with_logits",
"cdist",
"reciprocal",
"add",
"add_rms_norm",
"add_rms_norm_",
......
from .avg_pool1d import avg_pool1d
from .binary_cross_entropy_with_logits import binary_cross_entropy_with_logits
from .causal_softmax import causal_softmax
from .embedding import embedding
from .flash_attention import flash_attention
......@@ -18,6 +19,7 @@ __all__ = [
"embedding",
"flash_attention",
"linear",
"binary_cross_entropy_with_logits",
"random_sample",
"rms_norm",
"RopeAlgo",
......
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def binary_cross_entropy_with_logits(
    input: Tensor,
    target: Tensor,
    weight: Tensor | None = None,
    pos_weight: Tensor | None = None,
    reduction: str = "mean",
    *,
    out: Tensor | None = None,
) -> Tensor:
    """Binary cross entropy loss with logits.

    This wraps the underlying C++/CUDA implementation exposed via
    ``_infinicore``. The low-level binding treats missing ``weight`` /
    ``pos_weight`` via default-constructed tensors, so ``None`` is never
    passed down; optional tensors are forwarded as keyword arguments only
    when they are provided, letting pybind11 apply its defaults.

    Args:
        input: Logits tensor.
        target: Target label tensor.
        weight: Optional per-sample weight tensor.
        pos_weight: Optional positive-class weight tensor.
        reduction: One of ``"none"``, ``"mean"`` or ``"sum"``.
        out: Optional pre-allocated output tensor.

    Returns:
        The loss tensor (``out`` when it was supplied).
    """
    # Build the optional-argument set once instead of enumerating all
    # weight/pos_weight combinations; omitted keys fall back to the
    # binding's default-constructed tensors.
    kwargs = {"reduction": reduction}
    if weight is not None:
        kwargs["weight"] = weight._underlying
    if pos_weight is not None:
        kwargs["pos_weight"] = pos_weight._underlying

    if out is None:
        # Out-of-place API: the binding allocates and returns a new tensor.
        return Tensor(
            _infinicore.binary_cross_entropy_with_logits(
                input._underlying, target._underlying, **kwargs
            )
        )

    # Explicit-output API: the result is written into `out`.
    _infinicore.binary_cross_entropy_with_logits_(
        out._underlying, input._underlying, target._underlying, **kwargs
    )
    return out
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def addcmul(input, tensor1, tensor2, value=1.0, *, out=None):
    """Fused multiply-add wrapper around the ``_infinicore`` addcmul op.

    When ``out`` is omitted a new tensor is allocated and returned;
    otherwise the result is written into ``out`` and ``out`` is returned.
    """
    scale = float(value)
    if out is not None:
        _infinicore.addcmul_(
            out._underlying,
            input._underlying,
            tensor1._underlying,
            tensor2._underlying,
            scale,
        )
        return out
    raw = _infinicore.addcmul(
        input._underlying,
        tensor1._underlying,
        tensor2._underlying,
        scale,
    )
    return Tensor(raw)
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def atanh(input, *, out=None):
    """Element-wise inverse hyperbolic tangent via the ``_infinicore`` backend.

    Returns a new tensor when ``out`` is omitted; otherwise writes into
    ``out`` and returns it.
    """
    if out is not None:
        _infinicore.atanh_(out._underlying, input._underlying)
        return out
    return Tensor(_infinicore.atanh(input._underlying))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def binary_cross_entropy_with_logits(
    input, target, weight=None, pos_weight=None, reduction="mean", *, out=None
):
    """Binary cross entropy loss with logits.

    Args:
        input: Tensor (logits).
        target: Tensor (labels).
        weight: Tensor (optional, sample-wise weight).
        pos_weight: Tensor (optional, class-wise weight).
        reduction: str ('none', 'mean', 'sum').
        out: Optional pre-allocated output tensor.

    Returns:
        The loss tensor (``out`` when it was supplied).
    """
    # Fix: the binding models missing optional tensors with
    # default-constructed tensors, not ``None`` — passing ``None``
    # positionally is not accepted. Omit absent optionals so pybind11
    # applies its defaults instead.
    kwargs = {"reduction": str(reduction)}
    if weight is not None:
        kwargs["weight"] = weight._underlying
    if pos_weight is not None:
        kwargs["pos_weight"] = pos_weight._underlying

    if out is None:
        # Out-of-place interface: returns a freshly created tensor.
        return Tensor(
            _infinicore.binary_cross_entropy_with_logits(
                input._underlying, target._underlying, **kwargs
            )
        )

    # Explicit-output interface (binary_cross_entropy_with_logits_).
    _infinicore.binary_cross_entropy_with_logits_(
        out._underlying, input._underlying, target._underlying, **kwargs
    )
    return out
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def cdist(x1, x2, p=2.0, *, out=None):
    """Compute the p-norm distance between every pair of vectors in two sets.

    Args:
        x1 (Tensor): Input tensor of shape (M, D).
        x2 (Tensor): Input tensor of shape (N, D).
        p (float): Order of the norm; defaults to 2.0.
        out (Tensor, optional): Tensor receiving the result.

    Returns:
        Tensor: Distance matrix of shape (M, N).
    """
    norm_order = float(p)
    if out is not None:
        # Explicit-output path: the result is written into the caller's tensor.
        _infinicore.cdist_(
            out._underlying,
            x1._underlying,
            x2._underlying,
            norm_order,
        )
        return out
    # Out-of-place path: the C++ backend derives the output shape from
    # x1 and x2 and creates a new tensor.
    return Tensor(
        _infinicore.cdist(
            x1._underlying,
            x2._underlying,
            norm_order,
        )
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment