Commit d3ad6274 authored by xuxzh1's avatar xuxzh1 🎱
Browse files

init

parent 97b02a89
#version 450
#include "rope_common.comp"
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
void main() {
const uint i3 = gl_WorkGroupID.z;
const uint i2 = gl_WorkGroupID.y;
const uint i1 = gl_WorkGroupID.x;
const bool is_neox = (pcs.mode & 2) != 0;
float corr_dims[2];
rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
const int p = inB[pcs.inBOff + i2];
float theta = float(p);
if (!is_neox) {
for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
float cos_theta, sin_theta;
rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
theta *= theta_scale;
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
const float x0 = inA[src];
const float x1 = inA[src+1];
out_[dst_data] = x0*cos_theta - x1*sin_theta;
out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
}
} else {
const float inv_ndims = -1.f/pcs.n_dims;
for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
const uint cur_rot = ic;
float cos_theta, sin_theta;
rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
theta *= theta_scale;
const uint i0 = ic/2;
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
const float x0 = inA[src];
const float x1 = inA[src+pcs.n_dims/2];
out_[dst_data] = x0*cos_theta - x1*sin_theta;
out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
}
for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
const uint i0 = ic;
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
out_[dst_data + 0] = inA[src + 0];
out_[dst_data + 1] = inA[src + 1];
}
}
}
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
float scale;
} pcs;
void main() {
const uint i = gl_WorkGroupID.x;
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
}
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
float scale;
} pcs;
void main() {
const uint baseIndex = gl_WorkGroupID.x * 8;
for (uint x = 0; x < 8; x++) {
const uint i = baseIndex + x;
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
}
}
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
} pcs;
void main() {
const uint baseIndex = gl_WorkGroupID.x * 4;
for (uint x = 0; x < 4; x++) {
const uint i = baseIndex + x;
const float y = in_[i + pcs.inOff];
out_[i + pcs.outOff] = y / (1.0 + exp(-y));
}
}
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
#version 450
#include "common.comp"
layout(local_size_x_id = 0) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int ne01;
int ne02;
float scale;
int mask;
} pcs;
void main() {
if (gl_SubgroupInvocationID > 31)
return;
const uint i03 = gl_WorkGroupID.z;
const uint i02 = gl_WorkGroupID.y;
const uint i01 = gl_WorkGroupID.x;
const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
const uint pdst = extra_off + pcs.outOff; // Based from out_
// parallel max
float localMax = uintBitsToFloat(0xFF800000);
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
}
float max_ = subgroupMax(localMax);
// parallel sum
float localSum = 0.0f;
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
localSum += exp_psrc0;
out_[pdst + i00] = exp_psrc0;
}
const float sum = subgroupAdd(localSum);
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
out_[pdst + i00] /= sum;
}
}
#include "common.comp"
// TODO: use a local size of 32 or more (Metal uses 1024)
layout(local_size_x = 1) in;
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int n_dims;
int mode;
int n_orig_ctx;
float freq_base;
float freq_scale;
float ext_factor;
float attn_factor;
float beta_fast;
float beta_slow;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
float rope_yarn_ramp(const float low, const float high, const float i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
}
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
void rope_yarn(
float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
out float cos_theta, out float sin_theta
) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
}
cos_theta = cos(theta) * mscale;
sin_theta = sin(theta) * mscale;
}
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base));
}
void rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
) {
// start and end correction dims
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
}
%clang
-fdeclspec
-fms-extensions
-Wall
-Wextra
-std=c++17
%h -x
%h c++-header
-DDEBUG=1
-DKOMPUTE_INCLUDE_FOR_SYNTAX
-I/usr/include/python3.6/
-I./python/pybind11/include/
-I./build/_deps/vulkan_header-src/include/
-I./build/_deps/spdlog-src/include/
-I./build/_deps/googletest-src/googletest/include/
-I./build/_deps/fmt-src/include/
-I./src/include/
-I./build/src/shaders/glsl/
-I./build/test/shaders/glsl/
-I./test/utils/
---
BasedOnStyle: Mozilla
IndentWidth: 4
...
build/*
examples/*
docker-builders/
swiftshader/
name: C++ Tests
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
array-multiplication-example:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
env:
VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: "[Release g++] Build & Test"
uses: KomputeProject/action-cmake-build@master
with:
build-dir: ${{github.workspace}}/examples/array_multiplication/build
source-dir: ${{github.workspace}}/examples/array_multiplication
cc: gcc
cxx: g++
build-type: Debug
run-test: false
ctest-options: -V
configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
build-options: --parallel # Given we don't build too many resources we can leverage parallel
- name: Run tests
run: ./examples/array_multiplication/build/src/kompute_array_mult
logistc-regression-example:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
env:
VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: "[Release g++] Build & Test"
uses: KomputeProject/action-cmake-build@master
with:
build-dir: ${{github.workspace}}/examples/logistic_regression/build
source-dir: ${{github.workspace}}/examples/logistic_regression
cc: gcc
cxx: g++
build-type: Debug
run-test: false
ctest-options: -V
configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
build-options: --parallel # Given we don't build too many resources we can leverage parallel
- name: Run tests
run: ./examples/logistic_regression/build/src/kompute_logistic_regression
name: C++ Tests
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
cpp-tests-debug-with-debug-layers:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
env:
VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: "[Release g++] Build & Test"
uses: KomputeProject/action-cmake-build@master
with:
build-dir: ${{github.workspace}}/build
source-dir: ${{github.workspace}}
cc: gcc
cxx: g++
build-type: Debug
run-test: false
ctest-options: -V
configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
- name: Run tests
run: make mk_run_tests
cpp-tests-release-with-debug-layers:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
env:
VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: "[Release g++] Build & Test"
uses: KomputeProject/action-cmake-build@master
with:
build-dir: ${{github.workspace}}/build
source-dir: ${{github.workspace}}
cc: gcc
cxx: g++
build-type: Release
run-test: false
ctest-options: -V
configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
- name: Run tests
run: make mk_run_tests
cpp-tests-debug-without-debug-layers:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
env:
VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: "[Release g++] Build & Test"
uses: KomputeProject/action-cmake-build@master
with:
build-dir: ${{github.workspace}}/build
source-dir: ${{github.workspace}}
cc: gcc
cxx: g++
build-type: Debug
run-test: false
ctest-options: -V
configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
- name: Run tests
run: make mk_run_tests
cpp-tests-release-without-debug-layers:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
env:
VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: "[Release g++] Build & Test"
uses: KomputeProject/action-cmake-build@master
with:
build-dir: ${{github.workspace}}/build
source-dir: ${{github.workspace}}
cc: gcc
cxx: g++
build-type: Release
run-test: false
ctest-options: -V
configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
- name: Run tests
run: make mk_run_tests
name: Python Tests
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
python-tests:
runs-on: ubuntu-latest
container: axsauze/kompute-builder:0.4
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: Install Python Requirements
run: pip3 install --user -r python/test/requirements-dev.txt
- name: Python Build
env:
KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
run: pip3 install --user . -v
- name: Python run Tests
run: |
export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
make test_python
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
# Logs and databases #
######################
*.log
*.sql
*.sqlite
# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Python
__pycache__
*.pyc
dist/
kp.egg-info/
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
# TAgs
tags
tags.*
# Visual Studio 2015 user specific files
.vs/
# Visual Studio 2015 database file
*.VC.db
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.ipa
# These project files can be generated by the engine
*.xcodeproj
*.xcworkspace
*.sln
*.suo
*.opensdf
*.sdf
*.VC.db
*.VC.opendb
# Precompiled Assets
SourceArt/**/*.png
SourceArt/**/*.tga
# Binary Files
Binaries/*
Plugins/*/Binaries/*
# Builds
Build/*
# Whitelist PakBlacklist-<BuildConfiguration>.txt files
!Build/*/
Build/*/**
!Build/*/PakBlacklist*.txt
# Don't ignore icon files in Build
!Build/**/*.ico
# Built data for maps
*_BuiltData.uasset
# Configuration files generated by the Editor
Saved/*
# Compiled source files for the engine to use
Intermediate/*
Plugins/*/Intermediate/*
# Cache files for the editor to use
DerivedDataCache/*
# Starter Content Ignored
Content/StarterContent/*
# VSCode Files
/.vscode/*
BuildingEscape.code-workspace
compile_commands.json
.clangd/
.cache/
# Project files
bin/
external/boost/
tmp/
# CMake
build/
release/
# Kompute #
###################
swiftshader/
vk_swiftshader_icd.json
tmp_kp_shader.comp.spv
tmp_kp_shader.comp
# Docs
_build/
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment