Unverified commit 3ea7b945, authored by Harry Mellor, committed by GitHub
Browse files

Move linting to `pre-commit` (#11975)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 51ef828f
...@@ -43,7 +43,7 @@ main() { ...@@ -43,7 +43,7 @@ main() {
# The figures should be genereated by a separate process outside the CI/CD pipeline # The figures should be generated by a separate process outside the CI/CD pipeline
# # generate figures # # generate figures
# python3 -m pip install tabulate pandas matplotlib # python3 -m pip install tabulate pandas matplotlib
......
name: Lint GitHub Actions workflows
on:
push:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color
name: clang-format
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- '**/*.h'
- '**/*.cpp'
- '**/*.cu'
- '**/*.cuh'
- '.github/workflows/clang-format.yml'
pull_request:
branches:
- main
paths:
- '**/*.h'
- '**/*.cpp'
- '**/*.cu'
- '**/*.cuh'
- '.github/workflows/clang-format.yml'
jobs:
clang-format:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install clang-format==18.1.5
- name: Running clang-format
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/quantization/gguf/ggml-common.h'
'csrc/quantization/gguf/dequantize.cuh'
'csrc/quantization/gguf/vecdotq.cuh'
'csrc/quantization/gguf/mmq.cuh'
'csrc/quantization/gguf/mmvq.cuh'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| xargs clang-format --dry-run --Werror
name: codespell
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- "**/*.md"
- "**/*.rst"
- pyproject.toml
- requirements-lint.txt
- .github/workflows/codespell.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- "**/*.md"
- "**/*.rst"
- pyproject.toml
- requirements-lint.txt
- .github/workflows/codespell.yml
jobs:
codespell:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
name: Lint documentation
on:
push:
branches:
- main
paths:
- "docs/**"
pull_request:
branches:
- main
paths:
- "docs/**"
jobs:
doc-lint:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Linting docs
run: tools/doc-lint.sh
name: dummy-checks
on:
pull_request:
jobs:
mypy:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- run: echo "This is a dummy step that always passes"
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- run: echo "This is a dummy step that always passes"
{
"problemMatcher": [
{
"owner": "ruff",
"pattern": [
{
"regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
"file": 1,
"line": 2,
"column": 3,
"code": 4,
"message": 5
}
]
}
]
}
name: mypy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- '**/*.py'
- '.github/workflows/mypy.yaml'
- 'tools/mypy.sh'
- 'pyproject.toml'
pull_request:
branches:
- main
# This workflow is only relevant when one of the following files changes.
# However, we have github configured to expect and require this workflow
# to run and pass before github will auto-merge a pull request. Until github
# allows a more flexible auto-merge policy, we can just run this on every PR.
# It doesn't take that long to run, anyway.
#paths:
# - '**/*.py'
# - '.github/workflows/mypy.yaml'
# - 'tools/mypy.sh'
# - 'pyproject.toml'
jobs:
mypy:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.11.1
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
echo "::add-matcher::.github/workflows/matchers/mypy.json"
tools/mypy.sh 1 ${{ matrix.python-version }}
name: Lint PNG exports from excalidraw
on:
push:
branches:
- "main"
paths:
- '*.excalidraw.png'
- '.github/workflows/png-lint.yml'
pull_request:
branches:
- "main"
paths:
- '*.excalidraw.png'
- '.github/workflows/png-lint.yml'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Run png-lint.sh to check excalidraw exported images"
run: |
tools/png-lint.sh
name: pre-commit
on:
pull_request:
push:
branches: [main]
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
name: ruff
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- pyproject.toml
- requirements-lint.txt
- .github/workflows/matchers/ruff.json
- .github/workflows/ruff.yml
pull_request:
branches:
- main
# This workflow is only relevant when one of the following files changes.
# However, we have github configured to expect and require this workflow
# to run and pass before github will auto-merge a pull request. Until github
# allows a more flexible auto-merge policy, we can just run this on every PR.
# It doesn't take that long to run, anyway.
#paths:
# - "**/*.py"
# - pyproject.toml
# - requirements-lint.txt
# - .github/workflows/matchers/ruff.json
# - .github/workflows/ruff.yml
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only
name: Lint shell scripts
on:
push:
branches:
- "main"
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
pull_request:
branches:
- "main"
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
shellcheck:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check shell scripts"
run: |
tools/shellcheck.sh
name: yapf
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/yapf.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/yapf.yml
jobs:
yapf:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .
repos:
- repo: https://github.com/google/yapf
rev: v0.32.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.27
hooks:
- id: pymarkdown
files: docs/.*
- repo: local
hooks:
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10"
language: python
types: [python]
additional_dependencies: *mypy_deps
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: tools/mypy.sh 1 "3.11"
language: python
types: [python]
additional_dependencies: *mypy_deps
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: tools/mypy.sh 1 "3.12"
language: python
types: [python]
additional_dependencies: *mypy_deps
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh
language: script
types: [shell]
- id: png-lint
name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh
language: script
types: [png]
- repo: https://github.com/rhysd/actionlint
rev: v1.7.6
hooks:
- id: actionlint
...@@ -32,7 +32,7 @@ class ScalarType { ...@@ -32,7 +32,7 @@ class ScalarType {
signed_(signed_), signed_(signed_),
bias(bias), bias(bias),
finite_values_only(finite_values_only), finite_values_only(finite_values_only),
nan_repr(nan_repr){}; nan_repr(nan_repr) {};
static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
return ScalarType(0, size_bits - 1, true, bias); return ScalarType(0, size_bits - 1, true, bias);
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
#define CPU_TYPES_HPP #define CPU_TYPES_HPP
#if defined(__x86_64__) #if defined(__x86_64__)
//x86 implementation // x86 implementation
#include "cpu_types_x86.hpp" #include "cpu_types_x86.hpp"
#elif defined(__POWER9_VECTOR__) #elif defined(__POWER9_VECTOR__)
//ppc implementation // ppc implementation
#include "cpu_types_vsx.hpp" #include "cpu_types_vsx.hpp"
#elif defined(__aarch64__) #elif defined(__aarch64__)
//arm implementation // arm implementation
#include "cpu_types_arm.hpp" #include "cpu_types_arm.hpp"
#else #else
#warning "unsupported vLLM cpu implementation" #warning "unsupported vLLM cpu implementation"
......
This diff is collapsed.
...@@ -17,30 +17,32 @@ namespace vec_op { ...@@ -17,30 +17,32 @@ namespace vec_op {
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#ifndef CPU_OP_GUARD #ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME) #define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME) #define CPU_KERNEL_GUARD_OUT(NAME)
#else #else
#define CPU_KERNEL_GUARD_IN(NAME) \ #define CPU_KERNEL_GUARD_IN(NAME) \
std::cout << #NAME << " invoked." << std::endl; std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; #define CPU_KERNEL_GUARD_OUT(NAME) \
std::cout << #NAME << " exit." << std::endl;
#endif #endif
#define FORCE_INLINE __attribute__((always_inline)) inline #define FORCE_INLINE __attribute__((always_inline)) inline
namespace { namespace {
template <typename T, T... indexes, typename F> template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) { constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...); (f(std::integral_constant<T, indexes>{}), ...);
} }
}; // namespace }; // namespace
template <typename T, T count, typename F, template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T>>> typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F &&f) { constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f)); unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
} }
template <typename T> struct Vec { template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
}; };
...@@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec<BF16Vec8> { ...@@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec<BF16Vec8> {
__vector signed short reg; __vector signed short reg;
explicit BF16Vec8(const void *ptr) explicit BF16Vec8(const void* ptr)
: reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {}
explicit BF16Vec8(const FP32Vec8 &); explicit BF16Vec8(const FP32Vec8&);
void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } void save(void* ptr) const {
*reinterpret_cast<__vector signed short*>(ptr) = reg;
}
}; };
struct BF16Vec16 : public Vec<BF16Vec16> { struct BF16Vec16 : public Vec<BF16Vec16> {
...@@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> { ...@@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
ss16x8x2_t reg; ss16x8x2_t reg;
explicit BF16Vec16(const void *ptr) { explicit BF16Vec16(const void* ptr) {
// Load 256 bits in two parts // Load 256 bits in two parts
reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr);
reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
} }
explicit BF16Vec16(const FP32Vec16 &); explicit BF16Vec16(const FP32Vec16&);
void save(void *ptr) const { void save(void* ptr) const {
// Save 256 bits in two parts // Save 256 bits in two parts
vec_xst(reg.val[0], 0, (signed short *)ptr); vec_xst(reg.val[0], 0, (signed short*)ptr);
vec_xst(reg.val[1], 16, (signed short *)ptr); vec_xst(reg.val[1], 16, (signed short*)ptr);
} }
}; };
...@@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> { ...@@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32; constexpr static int VEC_ELEM_NUM = 32;
ss16x8x4_t reg; ss16x8x4_t reg;
explicit BF16Vec32(const void *ptr) explicit BF16Vec32(const void* ptr)
: reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {} : reg(*reinterpret_cast<const ss16x8x4_t*>(ptr)) {}
explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ explicit BF16Vec32(const BF16Vec8& vec8_data)
vec8_data.reg, : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
vec8_data.reg,
vec8_data.reg,
vec8_data.reg
}) {}
void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; } void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
}; };
struct FP32Vec4 : public Vec<FP32Vec4> { struct FP32Vec4 : public Vec<FP32Vec4> {
...@@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> { ...@@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> {
explicit FP32Vec4() : reg(vec_splats(0.0f)) {} explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {}
explicit FP32Vec4(__vector float data) : reg(data) {} explicit FP32Vec4(__vector float data) : reg(data) {}
explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
}; };
struct FP32Vec8 : public Vec<FP32Vec8> { struct FP32Vec8 : public Vec<FP32Vec8> {
...@@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec<FP32Vec8> { ...@@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
reg.val[1] = vec_splats(0.0f); reg.val[1] = vec_splats(0.0f);
} }
explicit FP32Vec8(const float *ptr) { explicit FP32Vec8(const float* ptr) {
reg.val[0] = vec_xl(0, ptr); reg.val[0] = vec_xl(0, ptr);
reg.val[1] = vec_xl(16, ptr); reg.val[1] = vec_xl(16, ptr);
} }
explicit FP32Vec8(f32x4x2_t data) : reg(data) {} explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
explicit FP32Vec8(const FP32Vec8 &data) { explicit FP32Vec8(const FP32Vec8& data) {
reg.val[0] = data.reg.val[0]; reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1]; reg.val[1] = data.reg.val[1];
} }
explicit FP32Vec8(const BF16Vec8 &v) { explicit FP32Vec8(const BF16Vec8& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
} }
...@@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> { ...@@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
AliasReg ar; AliasReg ar;
ar.reg = reg; ar.reg = reg;
float result = 0; float result = 0;
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); unroll_loop<int, VEC_ELEM_NUM>(
[&result, &ar](int i) { result += ar.values[i]; });
return result; return result;
} }
...@@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> { ...@@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
} }
FP32Vec8 operator*(const FP32Vec8 &b) const { FP32Vec8 operator*(const FP32Vec8& b) const {
return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); return FP32Vec8(
{vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
} }
FP32Vec8 operator+(const FP32Vec8 &b) const { FP32Vec8 operator+(const FP32Vec8& b) const {
return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); return FP32Vec8(
{vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
} }
FP32Vec8 operator-(const FP32Vec8 &b) const { FP32Vec8 operator-(const FP32Vec8& b) const {
return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); return FP32Vec8(
{vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
} }
FP32Vec8 operator/(const FP32Vec8 &b) const { FP32Vec8 operator/(const FP32Vec8& b) const {
return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); return FP32Vec8(
{vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
} }
void save(float *ptr) const { void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[1], 16, ptr);
} }
...@@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
reg.val[3] = vec_splats(0.0f); reg.val[3] = vec_splats(0.0f);
} }
explicit FP32Vec16(const float *ptr) { explicit FP32Vec16(const float* ptr) {
reg.val[0] = vec_xl(0, ptr); reg.val[0] = vec_xl(0, ptr);
reg.val[1] = vec_xl(16, ptr); reg.val[1] = vec_xl(16, ptr);
reg.val[2] = vec_xl(32, ptr); reg.val[2] = vec_xl(32, ptr);
...@@ -284,63 +289,59 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -284,63 +289,59 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16(f32x4x4_t data) : reg(data) {} explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
explicit FP32Vec16(const FP32Vec16 &data) { explicit FP32Vec16(const FP32Vec16& data) {
reg.val[0] = data.reg.val[0]; reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1]; reg.val[1] = data.reg.val[1];
reg.val[2] = data.reg.val[2]; reg.val[2] = data.reg.val[2];
reg.val[3] = data.reg.val[3]; reg.val[3] = data.reg.val[3];
} }
explicit FP32Vec16(const FP32Vec4 &data) { explicit FP32Vec16(const FP32Vec4& data) {
reg.val[0] = data.reg; reg.val[0] = data.reg;
reg.val[1] = data.reg; reg.val[1] = data.reg;
reg.val[2] = data.reg; reg.val[2] = data.reg;
reg.val[3] = data.reg; reg.val[3] = data.reg;
} }
explicit FP32Vec16(const FP32Vec8 &data) { explicit FP32Vec16(const FP32Vec8& data) {
reg.val[0] = data.reg.val[0]; reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1]; reg.val[1] = data.reg.val[1];
reg.val[2] = data.reg.val[0]; reg.val[2] = data.reg.val[0];
reg.val[3] = data.reg.val[1]; reg.val[3] = data.reg.val[1];
} }
explicit FP32Vec16(const BF16Vec16 &v) { explicit FP32Vec16(const BF16Vec16& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
} }
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
FP32Vec16 operator*(const FP32Vec16 &b) const { FP32Vec16 operator*(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({ return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]),
vec_mul(reg.val[0], b.reg.val[0]),
vec_mul(reg.val[1], b.reg.val[1]), vec_mul(reg.val[1], b.reg.val[1]),
vec_mul(reg.val[2], b.reg.val[2]), vec_mul(reg.val[2], b.reg.val[2]),
vec_mul(reg.val[3], b.reg.val[3])})); vec_mul(reg.val[3], b.reg.val[3])}));
} }
FP32Vec16 operator+(const FP32Vec16 &b) const { FP32Vec16 operator+(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({ return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]),
vec_add(reg.val[0], b.reg.val[0]),
vec_add(reg.val[1], b.reg.val[1]), vec_add(reg.val[1], b.reg.val[1]),
vec_add(reg.val[2], b.reg.val[2]), vec_add(reg.val[2], b.reg.val[2]),
vec_add(reg.val[3], b.reg.val[3])})); vec_add(reg.val[3], b.reg.val[3])}));
} }
FP32Vec16 operator-(const FP32Vec16 &b) const { FP32Vec16 operator-(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({ return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]),
vec_sub(reg.val[0], b.reg.val[0]),
vec_sub(reg.val[1], b.reg.val[1]), vec_sub(reg.val[1], b.reg.val[1]),
vec_sub(reg.val[2], b.reg.val[2]), vec_sub(reg.val[2], b.reg.val[2]),
vec_sub(reg.val[3], b.reg.val[3])})); vec_sub(reg.val[3], b.reg.val[3])}));
} }
FP32Vec16 operator/(const FP32Vec16 &b) const { FP32Vec16 operator/(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({ return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]),
vec_div(reg.val[0], b.reg.val[0]),
vec_div(reg.val[1], b.reg.val[1]), vec_div(reg.val[1], b.reg.val[1]),
vec_div(reg.val[2], b.reg.val[2]), vec_div(reg.val[2], b.reg.val[2]),
vec_div(reg.val[3], b.reg.val[3])})); vec_div(reg.val[3], b.reg.val[3])}));
...@@ -350,12 +351,14 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -350,12 +351,14 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
AliasReg ar; AliasReg ar;
ar.reg = reg; ar.reg = reg;
float result = 0; float result = 0;
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); unroll_loop<int, VEC_ELEM_NUM>(
[&result, &ar](int i) { result += ar.values[i]; });
return result; return result;
} }
template <int group_size> float reduce_sub_sum(int idx) { template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0); static_assert(VEC_ELEM_NUM % group_size == 0);
AliasReg ar; AliasReg ar;
...@@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return result; return result;
} }
void save(float *ptr) const { void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[1], 16, ptr);
vec_xst(reg.val[2], 32, ptr); vec_xst(reg.val[2], 32, ptr);
...@@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
} }
}; };
template <typename T> struct VecType { using vec_type = void; }; template <typename T>
struct VecType {
using vec_type = void;
};
template <typename T> using vec_t = typename VecType<T>::vec_type; template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <> struct VecType<float> { using vec_type = FP32Vec8; }; template <>
struct VecType<float> {
using vec_type = FP32Vec8;
};
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
};
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
acc = acc + a * b; acc = acc + a * b;
} }
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { template <>
c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
reinterpret_cast<c10::BFloat16 *>(&v); c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
reinterpret_cast<c10::BFloat16*>(&v);
*ptr = *(v_ptr + 1); *ptr = *(v_ptr + 1);
} }
#ifndef __VEC_CLASS_FP_NAN #ifndef __VEC_CLASS_FP_NAN
#define __VEC_CLASS_FP_NAN (1 << 6) #define __VEC_CLASS_FP_NAN (1 << 6)
#endif #endif
const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
#ifndef _ARCH_PWR10 #ifndef _ARCH_PWR10
const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; 0x00007fff};
const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000,
const static __vector unsigned int one = { 1, 1, 1, 1 }; 0x7fc00000};
const static __vector unsigned int sh16 = {16, 16, 16, 16};
const static __vector unsigned int one = {1, 1, 1, 1};
#endif #endif
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
#ifdef _ARCH_PWR10 #ifdef _ARCH_PWR10
__vector signed short ret[2]; __vector signed short ret[2];
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16(
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); (__vector unsigned char)v.reg.val[0]);
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16(
(__vector unsigned char)v.reg.val[1]);
reg = vec_perm(ret[0], ret[1], omask); reg = vec_perm(ret[0], ret[1], omask);
#elif defined(_ARCH_PWR9) #elif defined(_ARCH_PWR9)
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
...@@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { ...@@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
__vector unsigned int rnd1 = vec_add(lsb1, bias); __vector unsigned int rnd1 = vec_add(lsb1, bias);
inp0 = vec_add(inp0, rnd0); inp0 = vec_add(inp0, rnd0);
inp1 = vec_add(inp1, rnd1); inp1 = vec_add(inp1, rnd1);
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); __vector __bool int sel0 =
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
__vector __bool int sel1 =
vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
inp0 = vec_sel(inp0, nan, sel0); inp0 = vec_sel(inp0, nan, sel0);
inp1 = vec_sel(inp1, nan, sel1); inp1 = vec_sel(inp1, nan, sel1);
inp0 = vec_sr(inp0, sh16); inp0 = vec_sr(inp0, sh16);
...@@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { ...@@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
#endif #endif
} }
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
#ifdef _ARCH_PWR10 #ifdef _ARCH_PWR10
__vector signed short ret[4]; __vector signed short ret[4];
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16(
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); (__vector unsigned char)v.reg.val[0]);
ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16(
ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); (__vector unsigned char)v.reg.val[1]);
ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16(
(__vector unsigned char)v.reg.val[2]);
ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16(
(__vector unsigned char)v.reg.val[3]);
reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[0] = vec_perm(ret[0], ret[1], omask);
reg.val[1] = vec_perm(ret[2], ret[3], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask);
#elif defined(_ARCH_PWR9) #elif defined(_ARCH_PWR9)
...@@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { ...@@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
inp1 = vec_add(inp1, rnd1); inp1 = vec_add(inp1, rnd1);
inp2 = vec_add(inp2, rnd2); inp2 = vec_add(inp2, rnd2);
inp3 = vec_add(inp3, rnd3); inp3 = vec_add(inp3, rnd3);
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); __vector __bool int sel0 =
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
__vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); __vector __bool int sel1 =
__vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
__vector __bool int sel2 =
vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
__vector __bool int sel3 =
vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
inp0 = vec_sel(inp0, nan, sel0); inp0 = vec_sel(inp0, nan, sel0);
inp1 = vec_sel(inp1, nan, sel1); inp1 = vec_sel(inp1, nan, sel1);
inp2 = vec_sel(inp2, nan, sel2); inp2 = vec_sel(inp2, nan, sel2);
...@@ -482,7 +514,7 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { ...@@ -482,7 +514,7 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
#endif #endif
} }
inline void prefetch(const void *addr) { inline void prefetch(const void* addr) {
__asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
} }
......
This diff is collapsed.
...@@ -27,8 +27,7 @@ ...@@ -27,8 +27,7 @@
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
int max_shared_mem_per_block_opt_in = 0; int max_shared_mem_per_block_opt_in = 0;
cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
device);
return max_shared_mem_per_block_opt_in; return max_shared_mem_per_block_opt_in;
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment