Unverified Commit cfd3219f authored by Wallas Henrique's avatar Wallas Henrique Committed by GitHub
Browse files

[Hardware][Apple] Native support for macOS Apple Silicon (#11696)


Signed-off-by: default avatarWallas Santos <wallashss@ibm.com>
Co-authored-by: default avatarMichael Goin <michael@neuralmagic.com>
parent a1b2b860
...@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) ...@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(MACOSX_FOUND TRUE)
endif()
# #
# Define environment variables for special configurations # Define environment variables for special configurations
# #
...@@ -13,6 +18,9 @@ endif() ...@@ -13,6 +18,9 @@ endif()
include_directories("${CMAKE_SOURCE_DIR}/csrc") include_directories("${CMAKE_SOURCE_DIR}/csrc")
set (ENABLE_NUMA TRUE)
# #
# Check the compile flags # Check the compile flags
# #
...@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") ...@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
"-mf16c" "-mf16c"
) )
endif() endif()
list(APPEND CXX_COMPILE_FLAGS
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-Xpreprocessor"
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp" "-fopenmp"
"-DVLLM_CPU_EXTENSION") "-DVLLM_CPU_EXTENSION")
endif()
execute_process(COMMAND cat /proc/cpuinfo if (NOT MACOSX_FOUND)
execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO) OUTPUT_VARIABLE CPUINFO)
if (NOT CPUINFO_RET EQUAL 0)
if (NOT CPUINFO_RET EQUAL 0)
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()
endif() endif()
function (find_isa CPUINFO TARGET OUT) function (find_isa CPUINFO TARGET OUT)
string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
if(NOT ISA_FOUND EQUAL -1) if(NOT ISA_FOUND EQUAL -1)
...@@ -54,12 +72,17 @@ endfunction() ...@@ -54,12 +72,17 @@ endfunction()
is_avx512_disabled(AVX512_DISABLED) is_avx512_disabled(AVX512_DISABLED)
find_isa(${CPUINFO} "avx2" AVX2_FOUND) if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
find_isa(${CPUINFO} "avx512f" AVX512_FOUND) set(APPLE_SILICON_FOUND TRUE)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND) else()
find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
...@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND) ...@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND)
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
endif() endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
elseif(APPLE_SILICON_FOUND)
message(STATUS "Apple Silicon Detected")
set(ENABLE_NUMA OFF)
else() else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
endif() endif()
...@@ -139,7 +165,12 @@ endif() ...@@ -139,7 +165,12 @@ endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS numa) if(ENABLE_NUMA)
list(APPEND LIBS numa)
else()
message(STATUS "NUMA is disabled")
add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()
# #
# _C extension # _C extension
......
...@@ -92,10 +92,67 @@ struct FP16Vec16 : public Vec<FP16Vec16> { ...@@ -92,10 +92,67 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
} }
} }
// Note: below is the unrolled version of the following code:
//
// for (int i = 0; i < remainder; ++i) {
// reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] =
// vgetq_lane_f16(temp, i);
// }
//
// For macOS build (Clang), the arm/neon intrinsics function
// `vgetq_lane_f16` needs the parameter `i` to be constant at compile
// time.
if (remainder > 0) { if (remainder > 0) {
float16x8_t temp = reg.val[full_blocks]; float16x8_t temp = reg.val[full_blocks];
for (int i = 0; i < remainder; ++i) { __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); switch (remainder)
{
case 1:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
break;
case 2:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
break;
case 3:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
break;
case 4:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
break;
case 5:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
break;
case 6:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
break;
case 7:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
break;
default:
break;
} }
} }
} }
......
#include <numa.h> #ifndef VLLM_NUMA_DISABLED
#include <unistd.h> #include <numa.h>
#include <string> #include <unistd.h>
#include <sched.h> #include <string>
#include <sched.h>
#endif
#include "cpu_types.hpp" #include "cpu_types.hpp"
#ifdef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) {
return std::string(
"Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
"no effect to setup thread affinity.");
}
#endif
#ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) { std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0); TORCH_CHECK(omp_cpu_mask->size > 0);
...@@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { ...@@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
omp_lock_t writelock; omp_lock_t writelock;
omp_init_lock(&writelock); omp_init_lock(&writelock);
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for schedule(static, 1)
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
cpu_set_t mask; cpu_set_t mask;
CPU_ZERO(&mask); CPU_ZERO(&mask);
...@@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { ...@@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
return ss.str(); return ss.str();
} }
#endif
\ No newline at end of file
(installation-apple)=
# Installation for macOS
vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
## Requirements
- **Operating System**: `macOS Sonoma` or later
- **SDK** `XCode 15.4` or later with Command Line Tools
- **Compilers**: `Apple Clang >= 15.0.0`
<!-- (arm-backend-quick-start-dockerfile)= -->
## Build and installation
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
```
$ git clone https://github.com/vllm-project/vllm.git
$ cd vllm
$ pip install -r requirements-cpu.txt
$ pip install -e .
```
```{note}
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
```
## Troubleshooting
If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
```
[...] fatal error: 'map' file not found
1 | #include <map>
| ^~~~~
1 error generated.
[2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o
[...] fatal error: 'cstddef' file not found
10 | #include <cstddef>
| ^~~~~~~~~
1 error generated.
```
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# Installation for ARM CPUs # Installation for ARM CPUs
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
- CPU backend inference capabilities - CPU backend inference capabilities
- Relevant runtime environment variables - Relevant runtime environment variables
...@@ -20,7 +20,7 @@ Contents: ...@@ -20,7 +20,7 @@ Contents:
## Requirements ## Requirements
- **Operating System**: Linux or macOS - **Operating System**: Linux or macOS
- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) - **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
- **Instruction Set Architecture (ISA)**: NEON support is required - **Instruction Set Architecture (ISA)**: NEON support is required
(arm-backend-quick-start-dockerfile)= (arm-backend-quick-start-dockerfile)=
......
...@@ -11,6 +11,7 @@ gpu-cuda ...@@ -11,6 +11,7 @@ gpu-cuda
gpu-rocm gpu-rocm
cpu-x86 cpu-x86
cpu-arm cpu-arm
cpu-apple
hpu-gaudi hpu-gaudi
tpu tpu
xpu xpu
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for CPUs # Dependencies for CPUs
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
torch==2.5.1; platform_machine == "aarch64" torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
datasets # for benchmark scripts datasets # for benchmark scripts
...@@ -34,9 +34,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) ...@@ -34,9 +34,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
if not sys.platform.startswith("linux"): if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
logger.warning( logger.warning(
"vLLM only supports Linux platform (including WSL). " "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
VLLM_TARGET_DEVICE = "cpu"
elif not (sys.platform.startswith("linux")
or sys.platform.startswith("darwin")):
logger.warning(
"vLLM only supports Linux platform (including WSL) and MacOS."
"Building on %s, " "Building on %s, "
"so vLLM may not be able to run correctly", sys.platform) "so vLLM may not be able to run correctly", sys.platform)
VLLM_TARGET_DEVICE = "empty" VLLM_TARGET_DEVICE = "empty"
......
...@@ -4,6 +4,7 @@ import enum ...@@ -4,6 +4,7 @@ import enum
import hashlib import hashlib
import json import json
import os import os
import sys
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass, field, replace from dataclasses import dataclass, field, replace
...@@ -2259,6 +2260,17 @@ def _get_and_verify_dtype( ...@@ -2259,6 +2260,17 @@ def _get_and_verify_dtype(
"supported for POWERPC.") "supported for POWERPC.")
torch_dtype = torch.bfloat16 torch_dtype = torch.bfloat16
# TODO: change this condition to check if the platform support bf16
# instead of checking the OS. For instance M2 shall supports bf16
# already. But we need to modify `cpu_extension.cmake` to activate
# the feature in the build.
if (current_platform.is_cpu() and sys.platform.startswith("darwin")
and current_platform.get_cpu_architecture()
== CpuArchEnum.ARM and config_dtype == torch.bfloat16):
logger.info("For macOS with Apple Silicon, currently bfloat16 "
"is not supported. Setting dtype to float16.")
torch_dtype = torch.float16
if current_platform.is_hpu() and config_dtype == torch.float16: if current_platform.is_hpu() and config_dtype == torch.float16:
logger.info( logger.info(
"For HPU, we cast models to bfloat16 instead of" "For HPU, we cast models to bfloat16 instead of"
......
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
import re import re
import signal import signal
import socket import socket
import sys
import tempfile import tempfile
import uuid import uuid
from argparse import Namespace from argparse import Namespace
...@@ -805,6 +806,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: ...@@ -805,6 +806,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
ssl_certfile=args.ssl_certfile, ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs, ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs, ssl_cert_reqs=args.ssl_cert_reqs,
# Workaround to work on macOS
fd=sock.fileno() if sys.platform.startswith("darwin") else None,
**uvicorn_kwargs, **uvicorn_kwargs,
) )
......
...@@ -524,6 +524,13 @@ def get_open_port() -> int: ...@@ -524,6 +524,13 @@ def get_open_port() -> int:
def find_process_using_port(port: int) -> Optional[psutil.Process]: def find_process_using_port(port: int) -> Optional[psutil.Process]:
# TODO: We can not check for running processes with network
# port on macOS. Therefore, we can not have a full graceful shutdown
# of vLLM. For now, let's not look for processes in this case.
# Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
if sys.platform.startswith("darwin"):
return None
for conn in psutil.net_connections(): for conn in psutil.net_connections():
if conn.laddr.port == port: if conn.laddr.port == port:
try: try:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment