init

d3ad6274 · xuxzh1 · 97b02a89 · 97b02a89 · 97b02a89 · 97b02a89
Commit d3ad6274 authored Nov 12, 2024 by xuxzh1 🎱
13 changed files
--- a/llm/llama.cpp/kompute-shaders/op_rope_f32.comp
+++ b/llm/llama.cpp/kompute-shaders/op_rope_f32.comp
-#version 450
-
-#include "rope_common.comp"
-
-layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-void main() { // Rotates element pairs of one inA row by position-dependent angles (RoPE) and writes the result to out_.
-    const uint i3 = gl_WorkGroupID.z; // the workgroup id selects the row through the nb03/nb02/nb01 (and nb3/nb2/nb1) strides
-    const uint i2 = gl_WorkGroupID.y; // i2 additionally selects the entry read from inB below
-    const uint i1 = gl_WorkGroupID.x;
-
-    const bool is_neox = (pcs.mode & 2) != 0; // bit value 2 of mode selects the split-halves pairing in the else branch
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); // factor applied to theta after each pair
-
-    const int p = inB[pcs.inBOff + i2]; // starting angle for this i2 (presumably the token position; confirm against the caller)
-
-    float theta = float(p);
-
-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { // pairs adjacent elements (i0, i0+1) across the whole row of ne0 elements
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale; // advance to the next pair's angle
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // index into inA; the / 4 presumably converts byte strides to float elements
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // index into out_, same conversion
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+1]; // NOTE(review): +1 is in elements, so this assumes consecutive i0 are adjacent in memory
-
-            out_[dst_data]   = x0*cos_theta - x1*sin_theta; // 2-D rotation of (x0, x1); cos/sin already include the magnitude scale from rope_yarn
-            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims; // NOTE(review): not referenced anywhere else in this function
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) { // only the first n_dims elements are rotated in this mode
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale; // advance to the next pair's angle
-
-            const uint i0 = ic/2; // pair index: element i0 is paired with element i0 + n_dims/2
-
-            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // index into inA
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // index into out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+pcs.n_dims/2]; // partner element taken from the second half of the rotated range
-
-            out_[dst_data] = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { // elements from n_dims up to ne0 are copied through unrotated, two per iteration
-            const uint i0 = ic;
-
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // index into inA
-            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0) / 4) + pcs.outOff; // index into out_
-
-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
-        }
-    }
-}
--- a/llm/llama.cpp/kompute-shaders/op_scale.comp
+++ b/llm/llama.cpp/kompute-shaders/op_scale.comp
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants { // per-dispatch parameters for the scale shader
-    uint inOff; // added to the element index when reading in_
-    uint outOff; // added to the element index when writing out_
-    float scale; // factor every input element is multiplied by
-} pcs;
-
-void main() { // Writes one scaled element: out_[i] = in_[i] * scale, where i is this workgroup's x id.
-    const uint i = gl_WorkGroupID.x; // one element per workgroup (local_size_x is 1)
-    out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
-}
--- a/llm/llama.cpp/kompute-shaders/op_scale_8.comp
+++ b/llm/llama.cpp/kompute-shaders/op_scale_8.comp
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants { // per-dispatch parameters for the 8-wide scale shader
-    uint inOff; // added to the element index when reading in_
-    uint outOff; // added to the element index when writing out_
-    float scale; // factor every input element is multiplied by
-} pcs;
-
-void main() { // Scales 8 consecutive elements per workgroup: out_[i] = in_[i] * scale.
-    const uint baseIndex = gl_WorkGroupID.x * 8; // first element handled by this workgroup
-
-    for (uint x = 0; x < 8; x++) { // NOTE(review): no bounds check here; presumably only dispatched when the element count is a multiple of 8
-        const uint i = baseIndex + x;
-        out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
-    }
-}
--- a/llm/llama.cpp/kompute-shaders/op_silu.comp
+++ b/llm/llama.cpp/kompute-shaders/op_silu.comp
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-layout(push_constant) uniform PushConstants { // per-dispatch parameters for the SiLU shader
-    uint inOff; // added to the element index when reading in_
-    uint outOff; // added to the element index when writing out_
-} pcs;
-
-void main() { // Applies y / (1 + exp(-y)) to 4 consecutive elements per workgroup.
-    const uint baseIndex = gl_WorkGroupID.x * 4; // first element handled by this workgroup
-
-    for (uint x = 0; x < 4; x++) { // NOTE(review): no bounds check here; presumably only dispatched when the element count is a multiple of 4
-        const uint i = baseIndex + x;
-        const float y = in_[i + pcs.inOff];
-        out_[i + pcs.outOff] = y / (1.0 + exp(-y)); // y * sigmoid(y)
-    }
-}
--- a/llm/llama.cpp/kompute-shaders/op_softmax.comp
+++ b/llm/llama.cpp/kompute-shaders/op_softmax.comp
-// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
-
-#version 450
-
-#include "common.comp"
-
-layout(local_size_x_id = 0) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants { // per-dispatch parameters for the softmax shader
-    uint inAOff; // added to the element index when reading inA
-    uint inBOff; // added to the element index when reading inB (the mask)
-    uint outOff; // added to the element index when writing out_
-    int ne00; // number of elements in one row; each workgroup normalizes one row
-    int ne01; // with ne02, only used to compute the row's element offset in main()
-    int ne02;
-    float scale; // multiplies every inA value before the mask is added
-    int mask; // non-zero: add the matching inB value to each scaled input; zero: inB is not read
-} pcs;
-
-void main() { // Computes a softmax over one row of ne00 elements (scaled, optionally masked) per workgroup.
-    if (gl_SubgroupInvocationID > 31) // only invocations 0..31 take part; the loops below stride by the same 32
-        return;
-
-    const uint i03 = gl_WorkGroupID.z;
-    const uint i02 = gl_WorkGroupID.y;
-    const uint i01 = gl_WorkGroupID.x;
-
-    const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00; // element offset of this row, shared by input and output
-    const uint psrc0 = extra_off + pcs.inAOff; // row start in inA
-    const uint pmask = i01*pcs.ne00 + pcs.inBOff; // row start in inB; depends only on i01, so the mask row is reused for every i02/i03
-    const uint pdst = extra_off + pcs.outOff; // row start in out_
-
-    // parallel max
-    float localMax = uintBitsToFloat(0xFF800000); // bit pattern of -infinity, the identity for max
-    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
-    }
-    float max_ = subgroupMax(localMax); // row maximum, reduced across the subgroup
-
-    // parallel sum
-    float localSum = 0.0f;
-    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_); // subtracting the row max keeps the exponent <= 0
-        localSum += exp_psrc0;
-        out_[pdst + i00] = exp_psrc0; // unnormalized value, divided by the sum in the last loop
-    }
-
-    const float sum = subgroupAdd(localSum); // row total, reduced across the subgroup
-    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { // each invocation revisits exactly the elements it wrote above
-        out_[pdst + i00] /= sum;
-    }
-}
--- a/llm/llama.cpp/kompute-shaders/rope_common.comp
+++ b/llm/llama.cpp/kompute-shaders/rope_common.comp
-#include "common.comp"
-
-// TODO: use a local size of 32 or more (Metal uses 1024)
-layout(local_size_x = 1) in;
-
-layout (push_constant) uniform parameter { // per-dispatch parameters shared by the RoPE shaders that include this file
-    uint inAOff; // added to the element index when reading inA
-    uint inBOff; // added to the index when reading inB
-    uint outOff; // added to the element index when writing out_
-    int n_dims; // number of rotated dimensions; also fixes the per-pair angle scale in op_rope_f32
-    int mode; // op_rope_f32 tests (mode & 2) to choose the split-halves ("neox") pairing
-    int n_orig_ctx; // forwarded to rope_yarn_corr_dims
-    float freq_base; // base of the angle scale; also forwarded to rope_yarn_corr_dims
-    float freq_scale; // forwarded to rope_yarn, where it multiplies the extrapolated angle
-    float ext_factor; // forwarded to rope_yarn; 0 disables the ramp blend and the magnitude correction
-    float attn_factor; // forwarded to rope_yarn as mscale
-    float beta_fast; // forwarded to rope_yarn_corr_dims (lower correction dim)
-    float beta_slow; // forwarded to rope_yarn_corr_dims (upper correction dim)
-    uint nb00; // source strides for indices i0..i3; op_rope_f32 divides the sum by 4, so presumably bytes
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0; // row length: upper bound of the element loops in op_rope_f32
-    uint nb0; // destination strides for indices i0..i3, treated the same way as nb00..nb03
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
-float rope_yarn_ramp(const float low, const float high, const float i0) { // Linear ramp over i0/2: 1 at or below low, 0 at or above high.
-    const float y = (i0 / 2 - low) / max(0.001f, high - low); // the 0.001 floor avoids dividing by zero when high == low
-    return 1.0f - min(1.0f, max(0.0f, y)); // clamp y to [0, 1], then invert
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-void rope_yarn( // Outputs cos/sin of the (possibly blended) rotation angle, both multiplied by the magnitude scale.
-    float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
-    out float cos_theta, out float sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap; // interpolated angle: the input angle scaled by freq_scale
-    float theta = theta_interp; // used unchanged when ext_factor == 0
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; // weight of the extrapolated angle
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // linear blend between the two angles
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); // mscale is a by-value parameter, so the caller's value is untouched
-    }
-    cos_theta = cos(theta) * mscale;
-    sin_theta = sin(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { // Evaluates the correction-dimension formula for n_rot rotations.
-    return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base)); // TWOPI_F is not defined in this file (presumably in common.comp)
-}
-
-void rope_yarn_corr_dims( // Fills dims[0] and dims[1] with the lower and upper correction dimensions, clamped to [0, n_dims - 1].
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
-) {
-    // start and end correction dims
-    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); // from beta_fast, rounded down, floored at 0
-    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); // from beta_slow, rounded up, capped at n_dims - 1
-}
--- a/llm/llama.cpp/kompute/.ccls
+++ b/llm/llama.cpp/kompute/.ccls
-
-%clang
-
-fdeclspec
-fms-extensions
-Wall
-Wextra
-std=c++17
-
-%h -x
-%h c++-header
-
-DDEBUG=1
-DKOMPUTE_INCLUDE_FOR_SYNTAX
-
-I/usr/include/python3.6/
-I./python/pybind11/include/
-
-I./build/_deps/vulkan_header-src/include/
-I./build/_deps/spdlog-src/include/
-I./build/_deps/googletest-src/googletest/include/
-I./build/_deps/fmt-src/include/
-
-I./src/include/
-I./build/src/shaders/glsl/
-I./build/test/shaders/glsl/
-I./test/utils/
--- a/llm/llama.cpp/kompute/.clang-format
+++ b/llm/llama.cpp/kompute/.clang-format
---
-BasedOnStyle: Mozilla
-IndentWidth: 4
-
-...
--- a/llm/llama.cpp/kompute/.dockerignore
+++ b/llm/llama.cpp/kompute/.dockerignore
-build/*
-examples/*
-docker-builders/
-swiftshader/
--- a/llm/llama.cpp/kompute/.github/workflows/cpp_examples.yml
+++ b/llm/llama.cpp/kompute/.github/workflows/cpp_examples.yml
-name: C++ Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  array-multiplication-example:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/examples/array_multiplication/build
-        source-dir: ${{github.workspace}}/examples/array_multiplication
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
-        build-options: --parallel # Few targets are built here, so a parallel build is affordable
-    - name: Run tests
-      run: ./examples/array_multiplication/build/src/kompute_array_mult
-
-  logistc-regression-example:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/examples/logistic_regression/build
-        source-dir: ${{github.workspace}}/examples/logistic_regression
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
-        build-options: --parallel # Few targets are built here, so a parallel build is affordable
-    - name: Run tests
-      run: ./examples/logistic_regression/build/src/kompute_logistic_regression
--- a/llm/llama.cpp/kompute/.github/workflows/cpp_tests.yml
+++ b/llm/llama.cpp/kompute/.github/workflows/cpp_tests.yml
-name: C++ Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  cpp-tests-debug-with-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-
-  cpp-tests-release-with-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Release
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-
-  cpp-tests-debug-without-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-  
-  cpp-tests-release-without-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Release
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
--- a/llm/llama.cpp/kompute/.github/workflows/python_tests.yml
+++ b/llm/llama.cpp/kompute/.github/workflows/python_tests.yml
-name: Python Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  python-tests:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: Install Python Requirements
-      run: pip3 install --user -r python/test/requirements-dev.txt
-    - name: Python Build
-      env:
-        KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
-        KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
-      run: pip3 install --user . -v
-    - name: Python run Tests
-      run: |
-        export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
-        make test_python
--- a/llm/llama.cpp/kompute/.gitignore
+++ b/llm/llama.cpp/kompute/.gitignore
-# Compiled source #
-###################
-*.com
-*.class
-*.dll
-*.exe
-*.o
-*.so
-
-# Packages #
-############
-# it's better to unpack these files and commit the raw source
-# git has its own built in compression methods
-*.7z
-*.dmg
-*.gz
-*.iso
-*.jar
-*.rar
-*.tar
-*.zip
-
-# Logs and databases #
-######################
-*.log
-*.sql
-*.sqlite
-
-# OS generated files #
-######################
-.DS_Store
-.DS_Store?
-._*
-.Spotlight-V100
-.Trashes
-ehthumbs.db
-Thumbs.db
-
-# Python
-__pycache__
-*.pyc
-dist/
-kp.egg-info/
-
-
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (http://nodejs.org/api/addons.html)
-build/Release
-
-# Dependency directories
-node_modules/
-
-# Tags
-tags
-tags.*
-
-# Visual Studio 2015 user specific files
-.vs/
-
-# Visual Studio 2015 database file
-*.VC.db
-
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
-*.ipa
-
-# These project files can be generated by the engine
-*.xcodeproj
-*.xcworkspace
-*.sln
-*.suo
-*.opensdf
-*.sdf
-*.VC.db
-*.VC.opendb
-
-# Precompiled Assets
-SourceArt/**/*.png
-SourceArt/**/*.tga
-
-# Binary Files
-Binaries/*
-Plugins/*/Binaries/*
-
-# Builds
-Build/*
-
-# Whitelist PakBlacklist-<BuildConfiguration>.txt files
-!Build/*/
-Build/*/**
-!Build/*/PakBlacklist*.txt
-
-# Don't ignore icon files in Build
-!Build/**/*.ico
-
-# Built data for maps
-*_BuiltData.uasset
-
-# Configuration files generated by the Editor
-Saved/*
-
-# Compiled source files for the engine to use
-Intermediate/*
-Plugins/*/Intermediate/*
-
-# Cache files for the editor to use
-DerivedDataCache/*
-
-# Starter Content Ignored
-
-Content/StarterContent/*
-
-# VSCode Files
-/.vscode/*
-BuildingEscape.code-workspace
-compile_commands.json
-.clangd/
-.cache/
-
-# Project files
-bin/
-external/boost/
-tmp/
-
-# CMake
-build/
-release/
-
-# Kompute #
-###################
-swiftshader/
-vk_swiftshader_icd.json
-tmp_kp_shader.comp.spv
-tmp_kp_shader.comp
-
-# Docs
-_build/
-