Commit adc559a8 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Add static build of KenLM (#2076)

Summary:
Add KenLM and its dependencies required for static build (`zlib`, `bzip2`, `lzma` and `boost-thread`).

KenLM and its dependencies are built, but since no corresponding code on the torchaudio side is changed, the resulting torchaudio extension module is unchanged. (Therefore, as long as the build process passes on CI, this PR should be good to go.)

Pull Request resolved: https://github.com/pytorch/audio/pull/2076

Reviewed By: carolineechen

Differential Revision: D33189980

Pulled By: mthrok

fbshipit-source-id: 6096113128b939f3cf70990c99aacc4aaa954584
parent c02faf04
...@@ -127,4 +127,5 @@ examples/tutorials/_assets ...@@ -127,4 +127,5 @@ examples/tutorials/_assets
# third parties # third parties
third_party/install/ third_party/install/
third_party/archives/
third_party/sox/archives/ third_party/sox/archives/
...@@ -2,3 +2,6 @@ ...@@ -2,3 +2,6 @@
path = third_party/kaldi/submodule path = third_party/kaldi/submodule
url = https://github.com/kaldi-asr/kaldi url = https://github.com/kaldi-asr/kaldi
ignore = dirty ignore = dirty
[submodule "third_party/kenlm/submodule"]
path = third_party/kenlm/submodule
url = https://github.com/kpu/kenlm
...@@ -59,6 +59,7 @@ endif() ...@@ -59,6 +59,7 @@ endif()
option(BUILD_SOX "Build libsox statically" ON) option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_KALDI "Build kaldi statically" ON) option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_RNNT "Enable RNN transducer" ON) option(BUILD_RNNT "Enable RNN transducer" ON)
option(BUILD_KENLM "Build KenLM statically" ON)
option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF) option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
option(USE_CUDA "Enable CUDA support" OFF) option(USE_CUDA "Enable CUDA support" OFF)
option(USE_ROCM "Enable ROCM support" OFF) option(USE_ROCM "Enable ROCM support" OFF)
......
...@@ -94,22 +94,39 @@ def _init_submodule(): ...@@ -94,22 +94,39 @@ def _init_submodule():
print(' --- Initialized submodule') print(' --- Initialized submodule')
def _parse_url(path):
with open(path, 'r') as file_:
for line in file_:
match = re.match(r'^\s*URL\s+(https:\/\/.+)$', line)
if match:
url = match.group(1)
yield url
def _parse_sox_sources(): def _parse_sox_sources():
sox_dir = ROOT_DIR / 'third_party' / 'sox' sox_dir = ROOT_DIR / 'third_party' / 'sox'
cmake_file = sox_dir / 'CMakeLists.txt' cmake_file = sox_dir / 'CMakeLists.txt'
archive_dir = sox_dir / 'archives' archive_dir = sox_dir / 'archives'
archive_dir.mkdir(exist_ok=True) archive_dir.mkdir(exist_ok=True)
with open(cmake_file, 'r') as file_: for url in _parse_url(cmake_file):
for line in file_: path = archive_dir / os.path.basename(url)
match = re.match(r'^\s*URL\s+(https:\/\/.+)$', line) yield path, url
if match:
url = match.group(1)
def _parse_kenlm_sources():
third_party_dir = ROOT_DIR / 'third_party'
libs = ['zlib', 'bzip2', 'lzma', 'boost']
archive_dir = third_party_dir / 'archives'
archive_dir.mkdir(exist_ok=True)
for lib in libs:
cmake_file = third_party_dir / lib / 'CMakeLists.txt'
for url in _parse_url(cmake_file):
path = archive_dir / os.path.basename(url) path = archive_dir / os.path.basename(url)
yield path, url yield path, url
def _fetch_sox_archives(): def _fetch_archives(src):
for dest, url in _parse_sox_sources(): for dest, url in src:
if not dest.exists(): if not dest.exists():
print(f' --- Fetching {os.path.basename(dest)}') print(f' --- Fetching {os.path.basename(dest)}')
torch.hub.download_url_to_file(url, dest, progress=False) torch.hub.download_url_to_file(url, dest, progress=False)
...@@ -119,7 +136,8 @@ def _fetch_third_party_libraries(): ...@@ -119,7 +136,8 @@ def _fetch_third_party_libraries():
if not (ROOT_DIR / 'third_party' / 'kaldi' / 'submodule' / 'CMakeLists.txt').exists(): if not (ROOT_DIR / 'third_party' / 'kaldi' / 'submodule' / 'CMakeLists.txt').exists():
_init_submodule() _init_submodule()
if os.name != 'nt': if os.name != 'nt':
_fetch_sox_archives() _fetch_archives(_parse_sox_sources())
_fetch_archives(_parse_kenlm_sources())
def _main(): def _main():
......
...@@ -22,3 +22,14 @@ if (BUILD_KALDI) ...@@ -22,3 +22,14 @@ if (BUILD_KALDI)
endif() endif()
set_property(GLOBAL PROPERTY TORCHAUDIO_THIRD_PARTIES "${TORCHAUDIO_THIRD_PARTIES}") set_property(GLOBAL PROPERTY TORCHAUDIO_THIRD_PARTIES "${TORCHAUDIO_THIRD_PARTIES}")
################################################################################
# KenLM
################################################################################
# Everything below is configured only when the BUILD_KENLM option is enabled.
# The compression libraries and boost are added ahead of kenlm, whose
# CMakeLists refers to the zlib, bzip2, lzma and boost targets.
if(BUILD_KENLM)
  add_subdirectory(zlib)
  add_subdirectory(bzip2)
  add_subdirectory(lzma)
  add_subdirectory(boost)
  add_subdirectory(kenlm)
endif()
# Static build of Boost.Thread.
#
# The archive is downloaded into ${ARCHIVE_DIR}, built in-source with b2, and
# installed into ${INSTALL_DIR}. Consumers use the `boost` INTERFACE target.
include(ExternalProject)

set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
set(BOOST_THREAD_LIBRARY ${INSTALL_DIR}/lib/libboost_thread.a)

# Keep `URL https://...` on a single line of its own: the archive pre-fetch
# helper scans this file with a regular expression of that shape.
ExternalProject_Add(boost_
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://boostorg.jfrog.io/artifactory/main/release/1.78.0/source/boost_1_78_0.tar.gz
  URL_HASH SHA256=94ced8b72956591c4775ae2207a9763d3600b30d9d7446562c552f0a14a63be7
  BUILD_IN_SOURCE 1
  CONFIGURE_COMMAND ./bootstrap.sh --with-libraries=thread --prefix=${INSTALL_DIR}
  BUILD_COMMAND ./b2 link=static
  INSTALL_COMMAND ./b2 link=static install
  # Declare the installed archive as an output of this project so generators
  # that require explicit outputs (e.g. Ninja) know which rule produces the
  # file linked through the `boost` target below.
  BUILD_BYPRODUCTS ${BOOST_THREAD_LIBRARY}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)

# Interface target carrying the usage requirements of the installed library.
add_library(boost INTERFACE)
# Anything depending on `boost` waits for the external build to finish.
add_dependencies(boost boost_)
target_include_directories(boost INTERFACE ${INSTALL_DIR}/include)
target_link_libraries(boost INTERFACE ${BOOST_THREAD_LIBRARY})
# Static build of bzip2.
#
# The archive is downloaded into ${ARCHIVE_DIR}, built in-source with its own
# Makefile, and installed into ${INSTALL_DIR}. Consumers use the `bzip2`
# INTERFACE target.
include(ExternalProject)

set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
set(BZIP2_LIBRARY ${INSTALL_DIR}/lib/libbz2.a)

# Keep `URL https://...` on a single line of its own: the archive pre-fetch
# helper scans this file with a regular expression of that shape.
ExternalProject_Add(bzip2_
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://sourceware.org/pub/bzip2/bzip2-1.0.8.tar.gz
  URL_HASH SHA256=ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269
  BUILD_IN_SOURCE 1
  # There is no configure step; an empty command disables it.
  CONFIGURE_COMMAND ""
  INSTALL_COMMAND make install PREFIX=${INSTALL_DIR}
  # Declare the installed archive as an output of this project so generators
  # that require explicit outputs (e.g. Ninja) know which rule produces the
  # file linked through the `bzip2` target below.
  BUILD_BYPRODUCTS ${BZIP2_LIBRARY}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)

# Interface target carrying the usage requirements of the installed library.
add_library(bzip2 INTERFACE)
# Anything depending on `bzip2` waits for the external build to finish.
add_dependencies(bzip2 bzip2_)
target_include_directories(bzip2 INTERFACE ${INSTALL_DIR}/include)
target_link_libraries(bzip2 INTERFACE ${BZIP2_LIBRARY})
# Static library built from a subset of the KenLM sources in `submodule/`.

# Sources from KenLM's `util` directory.
set(
  KENLM_UTIL_SOURCES
  submodule/util/bit_packing.cc
  submodule/util/double-conversion/bignum.cc
  submodule/util/double-conversion/bignum-dtoa.cc
  submodule/util/double-conversion/cached-powers.cc
  submodule/util/double-conversion/diy-fp.cc
  submodule/util/double-conversion/double-conversion.cc
  submodule/util/double-conversion/fast-dtoa.cc
  submodule/util/double-conversion/fixed-dtoa.cc
  submodule/util/double-conversion/strtod.cc
  submodule/util/ersatz_progress.cc
  submodule/util/exception.cc
  submodule/util/file.cc
  submodule/util/file_piece.cc
  submodule/util/float_to_string.cc
  submodule/util/integer_to_string.cc
  submodule/util/mmap.cc
  submodule/util/murmur_hash.cc
  submodule/util/parallel_read.cc
  submodule/util/pool.cc
  submodule/util/read_compressed.cc
  submodule/util/scoped.cc
  submodule/util/spaces.cc
  submodule/util/stream/chain.cc
  submodule/util/stream/count_records.cc
  submodule/util/stream/io.cc
  submodule/util/stream/line_input.cc
  submodule/util/stream/multi_progress.cc
  submodule/util/stream/rewindable_stream.cc
  submodule/util/string_piece.cc
  submodule/util/usage.cc
)

# Sources from KenLM's `lm` directory.
set(
  KENLM_SOURCES
  submodule/lm/bhiksha.cc
  submodule/lm/binary_format.cc
  submodule/lm/config.cc
  submodule/lm/lm_exception.cc
  submodule/lm/model.cc
  submodule/lm/quantize.cc
  submodule/lm/read_arpa.cc
  submodule/lm/search_hashed.cc
  submodule/lm/search_trie.cc
  submodule/lm/sizes.cc
  submodule/lm/trie.cc
  submodule/lm/trie_sort.cc
  submodule/lm/value_build.cc
  submodule/lm/virtual_interface.cc
  submodule/lm/vocab.cc
)

add_library(
  kenlm
  STATIC
  ${KENLM_UTIL_SOURCES}
  ${KENLM_SOURCES}
)

# The shared third-party install prefix is searched first (BEFORE) and only
# while compiling kenlm itself; the submodule root is exported to consumers.
target_include_directories(
  kenlm
  BEFORE
  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../install/include"
  PUBLIC submodule
)

# All four definitions are PUBLIC, so they also apply to targets that link
# kenlm.
target_compile_definitions(
  kenlm
  PUBLIC
  KENLM_MAX_ORDER=6
  HAVE_ZLIB
  HAVE_BZLIB
  HAVE_XZLIB
)

# PUBLIC spells out the transitive linking that the keyword-less signature
# provides by default.
target_link_libraries(
  kenlm
  PUBLIC
  zlib
  bzip2
  lzma
)

# Build-order dependency only: `boost` is not linked into kenlm here. The
# Boost external project installs into the same `../install` prefix that is
# on kenlm's private include path.
add_dependencies(kenlm boost)
Subproject commit 5cea457db26950a73d638425c183b368c06ed7c6
# Static build of liblzma (from the xz distribution).
#
# The archive is downloaded into ${ARCHIVE_DIR}, configured with only the
# static library enabled, and installed into ${INSTALL_DIR}. Consumers use the
# `lzma` INTERFACE target.
include(ExternalProject)

set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
set(LZMA_LIBRARY ${INSTALL_DIR}/lib/liblzma.a)

# To pass custom environment variables to the ExternalProject_Add command,
# we need to do `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
# https://stackoverflow.com/a/62437353
# We construct the custom environment variables here.
# Note that $ENV{...} is expanded when CMake configures the project.
set(envs
  "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
  "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
  "CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
)

# Keep `URL https://...` on a single line of its own: the archive pre-fetch
# helper scans this file with a regular expression of that shape.
ExternalProject_Add(lzma_
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://tukaani.org/xz/xz-5.2.5.tar.gz
  URL_HASH SHA256=f6f4910fd033078738bd82bfba4f49219d03b17eb0794eb91efbae419f4aba10
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/lzma_/configure
    --prefix=${INSTALL_DIR}
    --disable-xz
    --disable-xzdec
    --disable-lzmadec
    --disable-lzmainfo
    --disable-lzma-links
    --disable-scripts
    --disable-doc
    --enable-static
    --disable-shared
  # Declare the installed archive as an output of this project so generators
  # that require explicit outputs (e.g. Ninja) know which rule produces the
  # file linked through the `lzma` target below.
  BUILD_BYPRODUCTS ${LZMA_LIBRARY}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)

# Interface target carrying the usage requirements of the installed library.
add_library(lzma INTERFACE)
# Anything depending on `lzma` waits for the external build to finish.
add_dependencies(lzma lzma_)
target_include_directories(lzma INTERFACE ${INSTALL_DIR}/include)
target_link_libraries(lzma INTERFACE ${LZMA_LIBRARY})
# Static build of zlib.
#
# The archive is downloaded into ${ARCHIVE_DIR}, configured with `--static`,
# and installed into ${INSTALL_DIR}. Consumers use the `zlib` INTERFACE target.
include(ExternalProject)

set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
set(ZLIB_LIBRARY ${INSTALL_DIR}/lib/libz.a)

# To pass custom environment variables to the ExternalProject_Add command,
# we need to do `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
# https://stackoverflow.com/a/62437353
# We construct the custom environment variables here.
# The install prefix is handed to zlib's configure script through the `prefix`
# environment variable instead of a `--prefix` argument.
# Note that $ENV{...} is expanded when CMake configures the project.
set(envs
  "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
  "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
  "CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
  "prefix=${INSTALL_DIR}"
)

# Keep `URL https://...` on a single line of its own: the archive pre-fetch
# helper scans this file with a regular expression of that shape.
ExternalProject_Add(zlib_
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://zlib.net/zlib-1.2.11.tar.gz
  URL_HASH SHA256=c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} -E env ${envs}
    ${CMAKE_CURRENT_BINARY_DIR}/src/zlib_/configure
    --static
  # Declare the installed archive as an output of this project so generators
  # that require explicit outputs (e.g. Ninja) know which rule produces the
  # file linked through the `zlib` target below.
  BUILD_BYPRODUCTS ${ZLIB_LIBRARY}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
)

# Interface target carrying the usage requirements of the installed library.
add_library(zlib INTERFACE)
# Anything depending on `zlib` waits for the external build to finish.
add_dependencies(zlib zlib_)
target_include_directories(zlib INTERFACE ${INSTALL_DIR}/include)
target_link_libraries(zlib INTERFACE ${ZLIB_LIBRARY})
...@@ -37,6 +37,7 @@ def _get_build(var, default=False): ...@@ -37,6 +37,7 @@ def _get_build(var, default=False):
_BUILD_SOX = False if platform.system() == 'Windows' else _get_build("BUILD_SOX", True) _BUILD_SOX = False if platform.system() == 'Windows' else _get_build("BUILD_SOX", True)
_BUILD_KALDI = False if platform.system() == 'Windows' else _get_build("BUILD_KALDI", True) _BUILD_KALDI = False if platform.system() == 'Windows' else _get_build("BUILD_KALDI", True)
_BUILD_RNNT = _get_build("BUILD_RNNT", True) _BUILD_RNNT = _get_build("BUILD_RNNT", True)
_BUILD_KENLM = False if platform.system() == 'Windows' else _get_build("BUILD_KENLM", True)
_USE_ROCM = _get_build("USE_ROCM", torch.cuda.is_available() and torch.version.hip is not None) _USE_ROCM = _get_build("USE_ROCM", torch.cuda.is_available() and torch.version.hip is not None)
_USE_CUDA = _get_build("USE_CUDA", torch.cuda.is_available() and torch.version.hip is None) _USE_CUDA = _get_build("USE_CUDA", torch.cuda.is_available() and torch.version.hip is None)
_USE_OPENMP = _get_build("USE_OPENMP", True) and \ _USE_OPENMP = _get_build("USE_OPENMP", True) and \
...@@ -89,6 +90,7 @@ class CMakeBuild(build_ext): ...@@ -89,6 +90,7 @@ class CMakeBuild(build_ext):
f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}", f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}", f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",
f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}", f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}",
f"-DBUILD_KENLM:BOOL={'ON' if _BUILD_KENLM else 'OFF'}",
"-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON", "-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON",
f"-DUSE_ROCM:BOOL={'ON' if _USE_ROCM else 'OFF'}", f"-DUSE_ROCM:BOOL={'ON' if _USE_ROCM else 'OFF'}",
f"-DUSE_CUDA:BOOL={'ON' if _USE_CUDA else 'OFF'}", f"-DUSE_CUDA:BOOL={'ON' if _USE_CUDA else 'OFF'}",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment