init

2025/04/10 15:55:52

init
2025/04/10 15:55:52
033f82a9 · guobj · ef72564b · 033f82a9 · 033f82a9 · 033f82a9
Commit 033f82a9 authored Apr 10, 2025 by guobj
20 changed files
--- a/kokoro.js/demo/package-lock.json
+++ b/kokoro.js/demo/package-lock.json
--- a/kokoro.js/demo/package.json
+++ b/kokoro.js/demo/package.json
+{
+  "name": "kokoro-web",
+  "private": true,
+  "version": "0.0.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "vite build",
+    "lint": "eslint .",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "kokoro-js": "file:..",
+    "motion": "^11.12.0",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1"
+  },
+  "devDependencies": {
+    "@eslint/js": "^9.15.0",
+    "@types/react": "^18.3.12",
+    "@types/react-dom": "^18.3.1",
+    "@vitejs/plugin-react": "^4.3.4",
+    "autoprefixer": "^10.4.20",
+    "eslint": "^9.15.0",
+    "eslint-plugin-react": "^7.37.2",
+    "eslint-plugin-react-hooks": "^5.0.0",
+    "eslint-plugin-react-refresh": "^0.4.14",
+    "globals": "^15.12.0",
+    "postcss": "^8.4.49",
+    "tailwindcss": "^3.4.15",
+    "vite": "^6.0.1"
+  }
+}
--- a/kokoro.js/demo/postcss.config.js
+++ b/kokoro.js/demo/postcss.config.js
+// PostCSS configuration: registers the `tailwindcss` and `autoprefixer` plugins,
+// each with an empty options object.
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+};
--- a/kokoro.js/demo/public/hf-logo.svg
+++ b/kokoro.js/demo/public/hf-logo.svg
--- a/kokoro.js/demo/public/wave.svg
+++ b/kokoro.js/demo/public/wave.svg
+<svg xmlns="http://www.w3.org/2000/svg" width="1600" height="198">
+  <defs>
+    <linearGradient id="a" x1="50%" x2="50%" y1="-10.959%" y2="100%">
+      <stop stop-color="#57BBC1" stop-opacity=".25" offset="0%"/>
+      <stop stop-color="#015871" offset="100%"/>
+    </linearGradient>
+  </defs>
+  <path fill="url(#a)" fill-rule="evenodd" d="M.005 121C311 121 409.898-.25 811 0c400 0 500 121 789 121v77H0s.005-48 .005-77z" transform="matrix(-1 0 0 1 1600 0)"/>
+</svg>
--- a/kokoro.js/demo/src/App.jsx
+++ b/kokoro.js/demo/src/App.jsx
+import { useRef, useState, useEffect } from "react";
+import { motion } from "motion/react";
+
+export default function App() {
+  // Create a reference to the worker object.
+  const worker = useRef(null);
+
+  const [inputText, setInputText] = useState("Life is like a box of chocolates. You never know what you're gonna get.");
+  const [selectedSpeaker, setSelectedSpeaker] = useState("af_heart");
+
+  const [voices, setVoices] = useState([]);
+  const [status, setStatus] = useState(null);
+  const [error, setError] = useState(null);
+  const [loadingMessage, setLoadingMessage] = useState("Loading...");
+
+  const [results, setResults] = useState([]);
+
+  // We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted.
+  useEffect(() => {
+    // Create the worker if it does not yet exist.
+    worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
+      type: "module",
+    });
+
+    // Create a callback function for messages from the worker thread.
+    const onMessageReceived = (e) => {
+      switch (e.data.status) {
+        case "device":
+          setLoadingMessage(`Loading model (device="${e.data.device}")`);
+          break;
+        case "ready":
+          setStatus("ready");
+          setVoices(e.data.voices);
+          break;
+        case "error":
+          setError(e.data.data);
+          break;
+        case "complete":
+          const { audio, text } = e.data;
+          // Generation complete: re-enable the "Generate" button
+          setResults((prev) => [{ text, src: audio }, ...prev]);
+          setStatus("ready");
+          break;
+      }
+    };
+
+    const onErrorReceived = (e) => {
+      console.error("Worker error:", e);
+      setError(e.message);
+    };
+
+    // Attach the callback function as an event listener.
+    worker.current.addEventListener("message", onMessageReceived);
+    worker.current.addEventListener("error", onErrorReceived);
+
+    // Define a cleanup function for when the component is unmounted.
+    return () => {
+      worker.current.removeEventListener("message", onMessageReceived);
+      worker.current.removeEventListener("error", onErrorReceived);
+    };
+  }, []);
+
+  const handleSubmit = (e) => {
+    e.preventDefault();
+    setStatus("running");
+
+    worker.current.postMessage({
+      type: "generate",
+      text: inputText.trim(),
+      voice: selectedSpeaker,
+    });
+  };
+
+  return (
+    <div className="relative w-full min-h-screen bg-gradient-to-br from-gray-900 to-gray-700 flex flex-col items-center justify-center p-4 relative overflow-hidden font-sans">
+      <motion.div initial={{ opacity: 1 }} animate={{ opacity: status === null ? 1 : 0 }} transition={{ duration: 0.5 }} className="absolute w-screen h-screen justify-center flex flex-col items-center z-10 bg-gray-800/95 backdrop-blur-md" style={{ pointerEvents: status === null ? "auto" : "none" }}>
+        <div className="w-[250px] h-[250px] border-4 border-white shadow-[0_0_0_5px_#4973ff] rounded-full overflow-hidden">
+          <div className="loading-wave"></div>
+        </div>
+        <p className={`text-3xl my-5 text-center ${error ? "text-red-500" : "text-white"}`}>{error ?? loadingMessage}</p>
+      </motion.div>
+
+      <div className="max-w-3xl w-full space-y-8 relative z-[2]">
+        <div className="text-center">
+          <h1 className="text-5xl font-extrabold text-gray-100 mb-2 drop-shadow-lg font-heading">Kokoro Text-to-Speech</h1>
+          <p className="text-2xl text-gray-300 font-semibold font-subheading">
+            Powered by&nbsp;
+            <a href="https://github.com/hexgrad/kokoro" target="_blank" rel="noreferrer" className="underline">
+              Kokoro
+            </a>
+            &nbsp;and&nbsp;
+            <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline">
+              <img width="40" src="hf-logo.svg" className="inline translate-y-[-2px] me-1"></img>Transformers.js
+            </a>
+          </p>
+        </div>
+        <div className="bg-gray-800/50 backdrop-blur-sm border border-gray-700 rounded-lg p-6">
+          <form onSubmit={handleSubmit} className="space-y-4">
+            <textarea placeholder="Enter text..." value={inputText} onChange={(e) => setInputText(e.target.value)} className="w-full min-h-[100px] max-h-[300px] bg-gray-700/50 backdrop-blur-sm border-2 border-gray-600 rounded-xl resize-y text-gray-100 placeholder-gray-400 px-3 py-2 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent" rows={Math.min(8, inputText.split("\n").length)} />
+            <div className="flex flex-col items-center space-y-4">
+              <select value={selectedSpeaker} onChange={(e) => setSelectedSpeaker(e.target.value)} className="w-full bg-gray-700/50 backdrop-blur-sm border-2 border-gray-600 rounded-xl text-gray-100 px-3 py-2 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent">
+                {Object.entries(voices).map(([id, voice]) => (
+                  <option key={id} value={id}>
+                    {voice.name} ({voice.language === "en-us" ? "American" : "British"} {voice.gender})
+                  </option>
+                ))}
+              </select>
+              <button type="submit" className="inline-flex justify-center items-center px-6 py-2 text-lg font-semibold bg-gradient-to-t from-blue-600 to-purple-600 hover:from-blue-700 hover:to-purple-700 transition-colors duration-300 rounded-xl text-white disabled:opacity-50" disabled={status === "running" || inputText.trim() === ""}>
+                {status === "running" ? "Generating..." : "Generate"}
+              </button>
+            </div>
+          </form>
+        </div>
+
+        {results.length > 0 && (
+          <motion.div initial={{ y: 50, opacity: 0 }} animate={{ y: 0, opacity: 1 }} transition={{ duration: 0.5 }} className="max-h-[250px] overflow-y-auto px-2 mt-4 space-y-6 relative z-[2]">
+            {results.map((result, i) => (
+              <div key={i}>
+                <div className="text-white bg-gray-800/70 backdrop-blur-sm border border-gray-700 rounded-lg p-4 z-10">
+                  <span className="absolute right-5 font-bold">#{results.length - i}</span>
+                  <p className="mb-3 max-w-[95%]">{result.text}</p>
+                  <audio controls src={result.src} className="w-full">
+                    Your browser does not support the audio element.
+                  </audio>
+                </div>
+              </div>
+            ))}
+          </motion.div>
+        )}
+      </div>
+
+      <div className="bg-[#015871] pointer-events-none absolute left-0 w-full h-[5%] bottom-[-50px]">
+        <div className="wave"></div>
+        <div className="wave"></div>
+      </div>
+    </div>
+  );
+}
--- a/kokoro.js/demo/src/index.css
+++ b/kokoro.js/demo/src/index.css
+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+
+/*
+ * Wave animations adapted from the following two demos:
+ * - https://codepen.io/upasanaasopa/pen/poObEWZ
+ * - https://codepen.io/breakstorm00/pen/qBJZQNB
+ */
+
+*,
+*:before,
+*:after {
+  margin: 0;
+  padding: 0;
+  box-sizing: border-box;
+}
+
+.loading-wave {
+  position: relative;
+  top: 0;
+  width: 100%;
+  height: 100%;
+  background: #2c74b3;
+  border-radius: 50%;
+  box-shadow: inset 0 0 50px 0 rgba(0, 0, 0, 0.5);
+}
+
+.loading-wave:before,
+.loading-wave:after {
+  content: "";
+  position: absolute;
+  top: 0;
+  left: 50%;
+  width: 200%;
+  height: 200%;
+  background: black;
+  transform: translate(-50%, -75%);
+}
+
+.loading-wave:before {
+  border-radius: 45%;
+  background: rgba(255, 255, 255, 1);
+  animation: animate 5s linear infinite;
+}
+
+.loading-wave:after {
+  border-radius: 40%;
+  background: rgba(255, 255, 255, 0.5);
+  animation: animate 10s linear infinite;
+}
+
+.wave {
+  background: url(/wave.svg) repeat-x;
+  position: absolute;
+  top: -198px;
+  width: 6400px;
+  height: 198px;
+  animation: wave 7s cubic-bezier(0.36, 0.45, 0.63, 0.53) infinite;
+  transform: translate3d(0, 0, 0);
+}
+
+.wave:nth-of-type(2) {
+  top: -175px;
+  animation:
+    wave 7s cubic-bezier(0.36, 0.45, 0.63, 0.53) -0.125s infinite,
+    swell 7s ease -1.25s infinite;
+  opacity: 1;
+}
+
+@keyframes wave {
+  0% {
+    margin-left: 0;
+  }
+
+  100% {
+    margin-left: -1600px;
+  }
+}
+
+@keyframes swell {
+  0%,
+  100% {
+    transform: translate3d(0, -25px, 0);
+  }
+
+  50% {
+    transform: translate3d(0, 5px, 0);
+  }
+}
+
+@keyframes animate {
+  0% {
+    transform: translate(-50%, -75%) rotate(0deg);
+  }
+
+  100% {
+    transform: translate(-50%, -75%) rotate(360deg);
+  }
+}
--- a/kokoro.js/demo/src/main.jsx
+++ b/kokoro.js/demo/src/main.jsx
+import { StrictMode } from "react";
+import { createRoot } from "react-dom/client";
+import "./index.css";
+import App from "./App.jsx";
+
+// Mount <App /> into the element with id "root", wrapped in React's StrictMode.
+createRoot(document.getElementById("root")).render(
+  <StrictMode>
+    <App />
+  </StrictMode>,
+);
--- a/kokoro.js/demo/src/utils.js
+++ b/kokoro.js/demo/src/utils.js
+export async function detectWebGPU() {
+  try {
+    const adapter = await navigator.gpu.requestAdapter();
+    return !!adapter;
+  } catch (e) {
+    return false;
+  }
+}
--- a/kokoro.js/demo/src/worker.js
+++ b/kokoro.js/demo/src/worker.js
+import { KokoroTTS } from "kokoro-js";
+import { detectWebGPU } from "./utils.js";
+
+// Device detection
+const device = (await detectWebGPU()) ? "webgpu" : "wasm";
+self.postMessage({ status: "device", device });
+
+// Load the model
+const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
+const tts = await KokoroTTS.from_pretrained(model_id, {
+  dtype: device === "wasm" ? "q8" : "fp32",
+  device,
+}).catch((e) => {
+  self.postMessage({ status: "error", error: e.message });
+  throw e;
+});
+self.postMessage({ status: "ready", voices: tts.voices, device });
+
+// Listen for messages from the main thread
+self.addEventListener("message", async (e) => {
+  const { text, voice } = e.data;
+
+  // Generate speech
+  const audio = await tts.generate(text, { voice });
+
+  // Send the audio file back to the main thread
+  const blob = audio.toBlob();
+  self.postMessage({ status: "complete", audio: URL.createObjectURL(blob), text });
+});
--- a/kokoro.js/demo/tailwind.config.js
+++ b/kokoro.js/demo/tailwind.config.js
+/**
+ * Tailwind CSS configuration for the demo.
+ * `content` lists the files Tailwind is pointed at: `index.html` and every
+ * js/ts/jsx/tsx file under `src`. The theme is not extended and no plugins are registered.
+ * @type {import('tailwindcss').Config}
+ */
+export default {
+  content: ["./index.html", "./src/**/*.{js,ts,jsx,tsx}"],
+  theme: {
+    extend: {},
+  },
+  plugins: [],
+};
--- a/kokoro.js/demo/vite.config.js
+++ b/kokoro.js/demo/vite.config.js
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+
+// Vite configuration for the demo. Reference: https://vite.dev/config/
+export default defineConfig({
+  plugins: [react()],
+  // Workers are emitted as ES modules.
+  worker: { format: "es" },
+  build: {
+    target: "esnext",
+  },
+  // Only errors are logged when NODE_ENV is "development"; otherwise the "info" level is used.
+  logLevel: process.env.NODE_ENV === "development" ? "error" : "info",
+});
--- a/kokoro.js/package-lock.json
+++ b/kokoro.js/package-lock.json
--- a/kokoro.js/package.json
+++ b/kokoro.js/package.json
+{
+  "name": "kokoro-js",
+  "version": "1.2.0",
+  "type": "module",
+  "exports": {
+    "types": "./types/kokoro.d.ts",
+    "node": {
+      "import": "./dist/kokoro.js",
+      "require": "./dist/kokoro.cjs"
+    },
+    "default": "./dist/kokoro.web.js"
+  },
+  "scripts": {
+    "build": "rm -rf dist types && rollup -c && tsc && cp ../LICENSE LICENSE",
+    "format": "prettier --write . --print-width 1000",
+    "test": "vitest run"
+  },
+  "keywords": [
+    "kokoro",
+    "tts",
+    "text-to-speech"
+  ],
+  "author": {
+    "name": "hexgrad",
+    "email": "hello@hexgrad.com"
+  },
+  "browser": {
+    "path": false,
+    "fs/promises": false
+  },
+  "contributors": [
+    "Xenova"
+  ],
+  "license": "Apache-2.0",
+  "description": "High-quality text-to-speech for the web",
+  "dependencies": {
+    "@huggingface/transformers": "^3.3.3",
+    "phonemizer": "^1.2.1"
+  },
+  "devDependencies": {
+    "@rollup/plugin-node-resolve": "^16.0.0",
+    "@rollup/plugin-terser": "^0.4.4",
+    "prettier": "3.4.2",
+    "rollup": "^4.30.1",
+    "typescript": "^5.7.3",
+    "vitest": "^2.1.8"
+  },
+  "files": [
+    "types",
+    "dist",
+    "voices",
+    "README.md",
+    "LICENSE"
+  ],
+  "homepage": "https://github.com/hexgrad/kokoro",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/hexgrad/kokoro.git"
+  },
+  "publishConfig": {
+    "access": "public"
+  },
+  "jsdelivr": "./dist/kokoro.web.js",
+  "unpkg": "./dist/kokoro.web.js"
+}
--- a/kokoro.js/rollup.config.js
+++ b/kokoro.js/rollup.config.js
+import terser from "@rollup/plugin-terser";
+import { nodeResolve } from "@rollup/plugin-node-resolve";
+
+// Build the plugin list for one output: module resolution (with the given `browser` flag)
+// followed by minification that strips all comments.
+const plugins = (browser) => [nodeResolve({ browser }), terser({ format: { comments: false } })];
+
+// One entry per bundle that is emitted into ./dist.
+const OUTPUT_CONFIGS = [
+  // Node versions
+  {
+    file: "./dist/kokoro.cjs",
+    format: "cjs",
+  },
+  {
+    file: "./dist/kokoro.js",
+    format: "esm",
+  },
+
+  // Web version
+  {
+    file: "./dist/kokoro.web.js",
+    format: "esm",
+  },
+];
+
+// Extra options for the web bundle: warnings whose message mentions
+// "@huggingface/transformers" are dropped, every other warning is forwarded.
+const WEB_SPECIFIC_CONFIG = {
+  onwarn: (warning, warn) => {
+    if (!warning.message.includes("@huggingface/transformers")) warn(warning);
+  },
+};
+
+// Extra options for the Node bundles: the runtime dependencies are declared external.
+const NODE_SPECIFIC_CONFIG = {
+  external: ["@huggingface/transformers", "phonemizer"],
+};
+
+// Produce one rollup configuration per output. An output counts as the web build
+// when its file name ends with ".web.js".
+export default OUTPUT_CONFIGS.map((output) => {
+  const web = output.file.endsWith(".web.js");
+  return {
+    input: "./src/kokoro.js",
+    output,
+    plugins: plugins(web),
+    ...(web ? WEB_SPECIFIC_CONFIG : NODE_SPECIFIC_CONFIG),
+  };
+});
--- a/kokoro.js/src/kokoro.js
+++ b/kokoro.js/src/kokoro.js
+import { StyleTextToSpeech2Model, AutoTokenizer, Tensor, RawAudio } from "@huggingface/transformers";
+import { phonemize } from "./phonemize.js";
+import { TextSplitterStream } from "./splitter.js";
+import { getVoiceData, VOICES } from "./voices.js";
+
+const STYLE_DIM = 256;
+const SAMPLE_RATE = 24000;
+
+/**
+ * @typedef {Object} GenerateOptions
+ * @property {keyof typeof VOICES} [voice="af_heart"] The voice
+ * @property {number} [speed=1] The speaking speed
+ */
+
+/**
+ * @typedef {Object} StreamProperties
+ * @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
+ * @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
+ */
+
+export class KokoroTTS {
+  /**
+   * Create a new KokoroTTS instance.
+   * @param {import('@huggingface/transformers').StyleTextToSpeech2Model} model The model
+   * @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer
+   */
+  constructor(model, tokenizer) {
+    this.model = model;
+    this.tokenizer = tokenizer;
+  }
+
+  /**
+   * Load a KokoroTTS model from the Hugging Face Hub.
+   * @param {string} model_id The model id
+   * @param {Object} options Additional options
+   * @param {"fp32"|"fp16"|"q8"|"q4"|"q4f16"} [options.dtype="fp32"] The data type to use.
+   * @param {"wasm"|"webgpu"|"cpu"|null} [options.device=null] The device to run the model on.
+   * @param {import("@huggingface/transformers").ProgressCallback} [options.progress_callback=null] A callback function that is called with progress information.
+   * @returns {Promise<KokoroTTS>} The loaded model
+   */
+  static async from_pretrained(model_id, { dtype = "fp32", device = null, progress_callback = null } = {}) {
+    const model = StyleTextToSpeech2Model.from_pretrained(model_id, { progress_callback, dtype, device });
+    const tokenizer = AutoTokenizer.from_pretrained(model_id, { progress_callback });
+
+    const info = await Promise.all([model, tokenizer]);
+    return new KokoroTTS(...info);
+  }
+
+  get voices() {
+    return VOICES;
+  }
+
+  list_voices() {
+    console.table(VOICES);
+  }
+
+  _validate_voice(voice) {
+    if (!VOICES.hasOwnProperty(voice)) {
+      console.error(`Voice "${voice}" not found. Available voices:`);
+      console.table(VOICES);
+      throw new Error(`Voice "${voice}" not found. Should be one of: ${Object.keys(VOICES).join(", ")}.`);
+    }
+    const language = /** @type {"a"|"b"} */ (voice.at(0)); // "a" or "b"
+    return language;
+  }
+
+  /**
+   * Generate audio from text.
+   *
+   * @param {string} text The input text
+   * @param {GenerateOptions} options Additional options
+   * @returns {Promise<RawAudio>} The generated audio
+   */
+  async generate(text, { voice = "af_heart", speed = 1 } = {}) {
+    const language = this._validate_voice(voice);
+
+    const phonemes = await phonemize(text, language);
+    const { input_ids } = this.tokenizer(phonemes, {
+      truncation: true,
+    });
+
+    return this.generate_from_ids(input_ids, { voice, speed });
+  }
+
+  /**
+   * Generate audio from input ids.
+   * @param {Tensor} input_ids The input ids
+   * @param {GenerateOptions} options Additional options
+   * @returns {Promise<RawAudio>} The generated audio
+   */
+  async generate_from_ids(input_ids, { voice = "af_heart", speed = 1 } = {}) {
+    // Select voice style based on number of input tokens
+    const num_tokens = Math.min(Math.max(input_ids.dims.at(-1) - 2, 0), 509);
+
+    // Load voice style
+    const data = await getVoiceData(voice);
+    const offset = num_tokens * STYLE_DIM;
+    const voiceData = data.slice(offset, offset + STYLE_DIM);
+
+    // Prepare model inputs
+    const inputs = {
+      input_ids,
+      style: new Tensor("float32", voiceData, [1, STYLE_DIM]),
+      speed: new Tensor("float32", [speed], [1]),
+    };
+
+    // Generate audio
+    const { waveform } = await this.model(inputs);
+    return new RawAudio(waveform.data, SAMPLE_RATE);
+  }
+
+  /**
+   * Generate audio from text in a streaming fashion.
+   * @param {string|TextSplitterStream} text The input text
+   * @param {StreamGenerateOptions} options Additional options
+   * @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>}
+   */
+  async *stream(text, { voice = "af_heart", speed = 1, split_pattern = null } = {}) {
+    const language = this._validate_voice(voice);
+
+    /** @type {TextSplitterStream} */
+    let splitter;
+    if (text instanceof TextSplitterStream) {
+      splitter = text;
+    } else if (typeof text === "string") {
+      splitter = new TextSplitterStream();
+      const chunks = split_pattern
+        ? text
+            .split(split_pattern)
+            .map((chunk) => chunk.trim())
+            .filter((chunk) => chunk.length > 0)
+        : [text];
+      splitter.push(...chunks);
+    } else {
+      throw new Error("Invalid input type. Expected string or TextSplitterStream.");
+    }
+    for await (const sentence of splitter) {
+      const phonemes = await phonemize(sentence, language);
+      const { input_ids } = this.tokenizer(phonemes, {
+        truncation: true,
+      });
+
+      // TODO: There may be some cases where - even with splitting - the text is too long.
+      // In that case, we should split the text into smaller chunks and process them separately.
+      // For now, we just truncate these exceptionally long chunks
+      const audio = await this.generate_from_ids(input_ids, { voice, speed });
+      yield { text: sentence, phonemes, audio };
+    }
+  }
+}
+
+export { TextSplitterStream };
--- a/kokoro.js/src/phonemize.js
+++ b/kokoro.js/src/phonemize.js
+import { phonemize as espeakng } from "phonemizer";
+
+/**
+ * Helper function to split a string on a regex, but keep the delimiters.
+ * This is required, because the JavaScript `.split()` method does not keep the delimiters,
+ * and wrapping in a capturing group causes issues with existing capturing groups (due to nesting).
+ * @param {string} text The text to split.
+ * @param {RegExp} regex The regex to split on.
+ * @returns {{match: boolean; text: string}[]} The split string.
+ */
+function split(text, regex) {
+  const result = [];
+  let prev = 0;
+  for (const match of text.matchAll(regex)) {
+    const fullMatch = match[0];
+    if (prev < match.index) {
+      result.push({ match: false, text: text.slice(prev, match.index) });
+    }
+    if (fullMatch.length > 0) {
+      result.push({ match: true, text: fullMatch });
+    }
+    prev = match.index + fullMatch.length;
+  }
+  if (prev < text.length) {
+    result.push({ match: false, text: text.slice(prev) });
+  }
+  return result;
+}
+
+/**
+ * Helper function to split numbers into phonetic equivalents
+ * @param {string} match The matched number
+ * @returns {string} The phonetic equivalent
+ */
+function split_num(match) {
+  if (match.includes(".")) {
+    return match;
+  } else if (match.includes(":")) {
+    let [h, m] = match.split(":").map(Number);
+    if (m === 0) {
+      return `${h} o'clock`;
+    } else if (m < 10) {
+      return `${h} oh ${m}`;
+    }
+    return `${h} ${m}`;
+  }
+  let year = parseInt(match.slice(0, 4), 10);
+  if (year < 1100 || year % 1000 < 10) {
+    return match;
+  }
+  let left = match.slice(0, 2);
+  let right = parseInt(match.slice(2, 4), 10);
+  let suffix = match.endsWith("s") ? "s" : "";
+  if (year % 1000 >= 100 && year % 1000 <= 999) {
+    if (right === 0) {
+      return `${left} hundred${suffix}`;
+    } else if (right < 10) {
+      return `${left} oh ${right}${suffix}`;
+    }
+  }
+  return `${left} ${right}${suffix}`;
+}
+
+/**
+ * Helper function to format monetary values
+ * @param {string} match The matched currency
+ * @returns {string} The formatted currency
+ */
+function flip_money(match) {
+  const bill = match[0] === "$" ? "dollar" : "pound";
+  if (isNaN(Number(match.slice(1)))) {
+    return `${match.slice(1)} ${bill}s`;
+  } else if (!match.includes(".")) {
+    let suffix = match.slice(1) === "1" ? "" : "s";
+    return `${match.slice(1)} ${bill}${suffix}`;
+  }
+  const [b, c] = match.slice(1).split(".");
+  const d = parseInt(c.padEnd(2, "0"), 10);
+  let coins = match[0] === "$" ? (d === 1 ? "cent" : "cents") : d === 1 ? "penny" : "pence";
+  return `${b} ${bill}${b === "1" ? "" : "s"} and ${d} ${coins}`;
+}
+
+/**
+ * Helper function to process decimal numbers
+ * @param {string} match The matched number
+ * @returns {string} The formatted number
+ */
+function point_num(match) {
+  let [a, b] = match.split(".");
+  return `${a} point ${b.split("").join(" ")}`;
+}
+
+/**
+ * Normalize text for phonemization
+ * @param {string} text The text to normalize
+ * @returns {string} The normalized text
+ */
+function normalize_text(text) {
+  return (
+    text
+      // 1. Handle quotes and brackets
+      .replace(/[‘’]/g, "'")
+      .replace(/«/g, "“")
+      .replace(/»/g, "”")
+      .replace(/[“”]/g, '"')
+      .replace(/\(/g, "«")
+      .replace(/\)/g, "»")
+
+      // 2. Replace uncommon punctuation marks
+      .replace(/、/g, ", ")
+      .replace(/。/g, ". ")
+      .replace(/！/g, "! ")
+      .replace(/，/g, ", ")
+      .replace(/：/g, ": ")
+      .replace(/；/g, "; ")
+      .replace(/？/g, "? ")
+
+      // 3. Whitespace normalization
+      .replace(/[^\S \n]/g, " ")
+      .replace(/  +/, " ")
+      .replace(/(?<=\n) +(?=\n)/g, "")
+
+      // 4. Abbreviations
+      .replace(/\bD[Rr]\.(?= [A-Z])/g, "Doctor")
+      .replace(/\b(?:Mr\.|MR\.(?= [A-Z]))/g, "Mister")
+      .replace(/\b(?:Ms\.|MS\.(?= [A-Z]))/g, "Miss")
+      .replace(/\b(?:Mrs\.|MRS\.(?= [A-Z]))/g, "Mrs")
+      .replace(/\betc\.(?! [A-Z])/gi, "etc")
+
+      // 5. Normalize casual words
+      .replace(/\b(y)eah?\b/gi, "$1e'a")
+
+      // 5. Handle numbers and currencies
+      .replace(/\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)/g, split_num)
+      .replace(/(?<=\d),(?=\d)/g, "")
+      .replace(/[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b/gi, flip_money)
+      .replace(/\d*\.\d+/g, point_num)
+      .replace(/(?<=\d)-(?=\d)/g, " to ")
+      .replace(/(?<=\d)S/g, " S")
+
+      // 6. Handle possessives
+      .replace(/(?<=[BCDFGHJ-NP-TV-Z])'?s\b/g, "'S")
+      .replace(/(?<=X')S\b/g, "s")
+
+      // 7. Handle hyphenated words/letters
+      .replace(/(?:[A-Za-z]\.){2,} [a-z]/g, (m) => m.replace(/\./g, "-"))
+      .replace(/(?<=[A-Z])\.(?=[A-Z])/gi, "-")
+
+      // 8. Strip leading and trailing whitespace
+      .trim()
+  );
+}
+
+/**
+ * Escapes regular expression special characters from a string by replacing them with their escaped counterparts.
+ *
+ * @param {string} string The string to escape.
+ * @returns {string} The escaped string.
+ */
+function escapeRegExp(string) {
+  return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
+}
+
+const PUNCTUATION = ';:,.!?¡¿—…"«»“”(){}[]';
+const PUNCTUATION_PATTERN = new RegExp(`(\\s*[${escapeRegExp(PUNCTUATION)}]+\\s*)+`, "g");
+
+/**
+ * Phonemize text using the eSpeak-NG phonemizer.
+ *
+ * The text is optionally normalized, split on runs of punctuation (which are kept verbatim),
+ * and every non-punctuation section is phonemized. The joined result is then post-processed
+ * with a fixed list of replacements.
+ * @param {string} text The text to phonemize
+ * @param {"a"|"b"} language The language to use ("a" selects "en-us", anything else "en")
+ * @param {boolean} norm Whether to normalize the text
+ * @returns {Promise<string>} The phonemized text
+ */
+export async function phonemize(text, language = "a", norm = true) {
+  // 1. Normalize text
+  if (norm) {
+    text = normalize_text(text);
+  }
+
+  // 2. Split into chunks, to ensure we preserve punctuation
+  const sections = split(text, PUNCTUATION_PATTERN);
+
+  // 3. Convert each section to phonemes
+  // Punctuation sections (match === true) pass through unchanged; the remaining sections are
+  // phonemized concurrently and each result array is joined with spaces.
+  const lang = language === "a" ? "en-us" : "en";
+  const ps = (await Promise.all(sections.map(async ({ match, text }) => (match ? text : (await espeakng(text, lang)).join(" "))))).join("");
+
+  // 4. Post-process phonemes
+  let processed = ps
+    // https://en.wiktionary.org/wiki/kokoro#English
+    .replace(/kəkˈoːɹoʊ/g, "kˈoʊkəɹoʊ")
+    .replace(/kəkˈɔːɹəʊ/g, "kˈəʊkəɹəʊ")
+    .replace(/ʲ/g, "j")
+    .replace(/r/g, "ɹ")
+    .replace(/x/g, "k")
+    .replace(/ɬ/g, "l")
+    // Insert a space before "hˈʌndɹɪd" when it directly follows a letter.
+    .replace(/(?<=[a-zɹː])(?=hˈʌndɹɪd)/g, " ")
+    // Attach a free-standing " z" to the preceding token when punctuation, a space or the end follows.
+    .replace(/ z(?=[;:,.!?¡¿—…"«»“” ]|$)/g, "z");
+
+  // 5. Additional post-processing for American English
+  if (language === "a") {
+    processed = processed.replace(/(?<=nˈaɪn)ti(?!ː)/g, "di");
+  }
+  return processed.trim();
+}
--- a/kokoro.js/src/splitter.js
+++ b/kokoro.js/src/splitter.js
+/**
+ * Returns true if the character is considered a sentence terminator.
+ * This includes ASCII (".", "!", "?") and common Unicode terminators.
+ * NOTE: We also include newlines here, as this is favourable for text-to-speech systems.
+ * @param {string} c The character to test.
+ * @param {boolean} [includeNewlines=true] Whether to treat newlines as terminators.
+ * @returns {boolean}
+ */
+function isSentenceTerminator(c, includeNewlines = true) {
+  return ".!?…。？！".includes(c) || (includeNewlines && c === "\n");
+}
+
+/**
+ * Returns true if the character should be attached to the sentence terminator,
+ * such as closing quotes or brackets.
+ * @param {string} c The character to test.
+ * @returns {boolean}
+ */
+function isTrailingChar(c) {
+  return "\"')]}」』".includes(c);
+}
+
+/**
+ * Extracts a token (a contiguous sequence of non–whitespace characters)
+ * from the buffer starting at the given index.
+ * @param {string} buffer The input text.
+ * @param {number} start The starting index.
+ * @returns {string} The extracted token.
+ */
+function getTokenFromBuffer(buffer, start) {
+  let end = start;
+  while (end < buffer.length && !/\s/.test(buffer[end])) {
+    ++end;
+  }
+  return buffer.substring(start, end);
+}
+
+// List of common abbreviations. Note that strings with single letters joined by periods
+// (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately.
+const ABBREVIATIONS = new Set(["mr", "mrs", "ms", "dr", "prof", "sr", "jr", "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt", "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec", "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat"]);
+
+/**
+ * Determines if the given token (or series of initials) is a known abbreviation.
+ * @param {string} token The token to check.
+ * @returns {boolean}
+ */
+function isAbbreviation(token) {
+  // Remove possessive endings and trailing periods.
+  token = token.replace(/['’]s$/i, "").replace(/\.+$/, "");
+  return ABBREVIATIONS.has(token.toLowerCase());
+}
+
+// Map of closing punctuation to their corresponding opening punctuation.
+const MATCHING = new Map([
+  [")", "("],
+  ["]", "["],
+  ["}", "{"],
+  ["》", "《"],
+  ["〉", "〈"],
+  ["›", "‹"],
+  ["»", "«"],
+  ["〉", "〈"],
+  ["」", "「"],
+  ["』", "『"],
+  ["〕", "〔"],
+  ["】", "【"],
+]);
+// Set of opening punctuation characters.
+const OPENING = new Set(MATCHING.values());
+
+/**
+ * Updates the nesting stack to track quotes and paired punctuation.
+ * This supports both standard (", ', (), [], {}) and Japanese quotes (「」「』『』).
+ * (An apostrophe between letters is ignored so that contractions remain intact.)
+ * @param {string} c The current character.
+ * @param {string[]} stack The current nesting stack.
+ * @param {number} i The index of the character in the buffer.
+ * @param {string} buffer The full text being processed.
+ */
+function updateStack(c, stack, i, buffer) {
+  // Handle standard quotes.
+  if (c === '"' || c === "'") {
+    // Ignore an apostrophe if it's between letters (e.g., in contractions).
+    if (c === "'" && i > 0 && i < buffer.length - 1 && /[A-Za-z]/.test(buffer[i - 1]) && /[A-Za-z]/.test(buffer[i + 1])) {
+      return;
+    }
+    if (stack.length && stack.at(-1) === c) {
+      stack.pop();
+    } else {
+      stack.push(c);
+    }
+    return;
+  }
+  // Handle opening punctuation.
+  if (OPENING.has(c)) {
+    stack.push(c);
+    return;
+  }
+  // Handle closing punctuation.
+  const expectedOpening = MATCHING.get(c);
+  if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) {
+    stack.pop();
+  }
+}
+
+/**
+ * A simple stream-based text splitter that emits complete sentences.
+ *
+ * Text is appended with `push()`. Completed sentences accumulate in an internal
+ * queue that can be consumed with `for await` (as text arrives), with a
+ * synchronous iteration (which flushes the buffer first), or inspected through
+ * the `sentences` getter.
+ */
+export class TextSplitterStream {
+  constructor() {
+    // Text received so far that has not yet been emitted as a sentence.
+    this._buffer = "";
+    // Queue of completed sentences waiting to be consumed.
+    this._sentences = [];
+    // Resolve callback of the promise the async iterator is currently awaiting (or null).
+    this._resolver = null;
+    // Set by close(); lets the async iterator finish once the queue is empty.
+    this._closed = false;
+  }
+
+  /**
+   * Push one or more text chunks into the stream.
+   * The buffer is re-processed after every individual chunk.
+   * @param  {...string} texts Text fragments to process.
+   */
+  push(...texts) {
+    for (const txt of texts) {
+      this._buffer += txt;
+      this._process();
+    }
+  }
+
+  /**
+   * Closes the stream, signaling that no more text will be pushed.
+   * This will flush any remaining text in the buffer as a sentence
+   * and allow the consuming process to finish processing the stream.
+   * @throws {Error} If the stream has already been closed.
+   */
+  close() {
+    if (this._closed) {
+      throw new Error("Stream is already closed.");
+    }
+    this._closed = true;
+    this.flush();
+  }
+
+  /**
+   * Flushes any remaining text in the buffer as a sentence.
+   * Whitespace-only remainders are discarded. A waiting async consumer is woken
+   * up even when nothing was queued, so it can re-check the closed flag.
+   */
+  flush() {
+    const remainder = this._buffer.trim();
+    if (remainder.length > 0) {
+      this._sentences.push(remainder);
+    }
+    this._buffer = "";
+    this._resolve();
+  }
+
+  /**
+   * Resolve the pending promise to signal that sentences are available.
+   * Does nothing when no consumer is currently waiting.
+   * @private
+   */
+  _resolve() {
+    if (this._resolver) {
+      this._resolver();
+      this._resolver = null;
+    }
+  }
+
+  /**
+   * Processes the internal buffer to extract complete sentences.
+   * If the potential sentence boundary is at the end of the current buffer,
+   * it waits for more text before splitting.
+   *
+   * Every call rescans the whole remaining buffer from index 0 with a fresh
+   * nesting stack; accepted sentences are queued and removed from the buffer.
+   * @private
+   */
+  _process() {
+    // Index in `buffer` where the sentence currently being assembled begins.
+    let sentenceStart = 0;
+    const buffer = this._buffer;
+    const len = buffer.length;
+    let i = 0;
+    // Open quotes/brackets seen so far; splitting is only allowed while this is empty.
+    let stack = [];
+
+    // Helper to scan from the current index over trailing terminators and punctuation.
+    // Returns `end` (index of the last character that belongs to the sentence) and
+    // `nextNonSpace` (index of the first non-whitespace character after it, or `len`).
+    const scanBoundary = (idx) => {
+      let end = idx;
+      // Consume contiguous sentence terminators (excluding newlines).
+      while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
+        ++end;
+      }
+      // Consume trailing characters (e.g., closing quotes/brackets).
+      while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
+        ++end;
+      }
+      let nextNonSpace = end + 1;
+      while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
+        ++nextNonSpace;
+      }
+      return { end, nextNonSpace };
+    };
+
+    while (i < len) {
+      const c = buffer[i];
+      updateStack(c, stack, i, buffer);
+
+      // Only consider splitting if we're not inside any nested structure.
+      if (stack.length === 0 && isSentenceTerminator(c)) {
+        const currentSegment = buffer.slice(sentenceStart, i);
+        // Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
+        if (/(^|\n)\d+$/.test(currentSegment)) {
+          ++i;
+          continue;
+        }
+
+        const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);
+
+        // If the terminator is not a newline and there's no extra whitespace,
+        // we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
+        // (`i === nextNonSpace - 1` means nothing at all was consumed after the terminator.)
+        if (i === nextNonSpace - 1 && c !== "\n") {
+          ++i;
+          continue;
+        }
+
+        // Wait for more text if there's no non-whitespace character yet.
+        // Breaking out leaves the unfinished sentence in the buffer for the next call.
+        if (nextNonSpace === len) {
+          break;
+        }
+
+        // Determine the token immediately preceding the terminator.
+        // Walk back to the previous whitespace, but never before the current sentence start.
+        let tokenStart = i - 1;
+        while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
+          tokenStart--;
+        }
+        tokenStart = Math.max(sentenceStart, tokenStart + 1);
+        // NOTE(review): getTokenFromBuffer is defined outside this block; presumably it
+        // returns the whitespace-delimited token starting at tokenStart - confirm.
+        const token = getTokenFromBuffer(buffer, tokenStart);
+        if (!token) {
+          ++i;
+          continue;
+        }
+
+        // --- URL/email protection ---
+        // If the token appears to be a URL or email (contains "://" or "@")
+        // and does not already end with a terminator, skip splitting.
+        // The scan resumes right after the token instead of at the next character.
+        if ((/https?[,:]\/\//.test(token) || token.includes("@")) && !isSentenceTerminator(token.at(-1))) {
+          i = tokenStart + token.length;
+          continue;
+        }
+
+        // --- Abbreviation protection ---
+        if (isAbbreviation(token)) {
+          ++i;
+          continue;
+        }
+
+        // --- Middle initials heuristic ---
+        // If the token is a series of single-letter initials (each ending in a period)
+        // and is followed by a capitalized word, assume it's part of a name.
+        if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
+          ++i;
+          continue;
+        }
+
+        // --- Lookahead heuristic ---
+        // If the terminator is a period and the next non-whitespace character is lowercase,
+        // assume it is not the end of a sentence.
+        if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
+          ++i;
+          continue;
+        }
+
+        // Special case: ellipsis that stands alone should be merged with the following sentence.
+        const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
+        if (sentence === "..." || sentence === "…") {
+          ++i;
+          continue;
+        }
+
+        // Accept the sentence boundary.
+        if (sentence) {
+          this._sentences.push(sentence);
+        }
+        // Move to the next sentence.
+        i = sentenceStart = boundaryEnd + 1;
+        continue;
+      }
+      ++i;
+    }
+
+    // Remove the processed portion of the buffer.
+    this._buffer = buffer.substring(sentenceStart);
+
+    // Resolve any pending promise if sentences are available.
+    if (this._sentences.length > 0) {
+      this._resolve();
+    }
+  }
+
+  /**
+   * Async iterator to yield sentences as they become available.
+   * Finishes once the stream has been closed and the queue is empty.
+   * @returns {AsyncGenerator<string, void, void>}
+   * @throws {Error} If another consumer is already waiting on this stream.
+   */
+  async *[Symbol.asyncIterator]() {
+    if (this._resolver) {
+      throw new Error("Another iterator is already active.");
+    }
+    while (true) {
+      if (this._sentences.length > 0) {
+        yield this._sentences.shift();
+      } else if (this._closed) {
+        // No more text will be pushed.
+        break;
+      } else {
+        // Wait for more text.
+        // The stored resolver is invoked by _resolve() when sentences arrive or on flush.
+        await new Promise((resolve) => {
+          this._resolver = resolve;
+        });
+      }
+    }
+  }
+
+  /**
+   * Synchronous iterator that flushes the buffer and returns all sentences.
+   * The internal queue is handed over to the iterator and replaced by an empty one.
+   * @returns {Iterator<string>}
+   */
+  [Symbol.iterator]() {
+    this.flush();
+    const iterator = this._sentences[Symbol.iterator]();
+    this._sentences = [];
+    return iterator;
+  }
+
+  /**
+   * Returns the array of sentences currently available.
+   * This is the internal queue itself, not a copy.
+   * @type {string[]} The array of sentences.
+   * @readonly
+   */
+  get sentences() {
+    return this._sentences;
+  }
+}
+
+/**
+ * Convenience wrapper that breaks a complete string into sentences in one call.
+ * @param {string} text The text to split.
+ * @returns {string[]} The sentences produced by a one-shot TextSplitterStream, in order.
+ */
+export function split(text) {
+  const stream = new TextSplitterStream();
+  stream.push(text);
+  // Draining the stream through its synchronous iterator collects every sentence.
+  return Array.from(stream);
+}
--- a/kokoro.js/src/voices.js
+++ b/kokoro.js/src/voices.js
+import path from "path";
+import fs from "fs/promises";
+
+/**
+ * Catalogue of the voices that are currently enabled, keyed by voice id.
+ *
+ * Each entry has `name`, `language`, `gender`, `targetQuality` and `overallGrade`,
+ * plus an optional `traits` emoji string. For the enabled entries the id prefix
+ * mirrors the metadata: `af_`/`am_` are "en-us", `bf_`/`bm_` are "en-gb", and the
+ * second letter (`f`/`m`) matches the `gender` field.
+ *
+ * NOTE(review): `targetQuality` / `overallGrade` look like letter grades; their
+ * exact meaning is not defined in this file - confirm against the model card.
+ *
+ * Only the outer object is frozen (Object.freeze is shallow).
+ */
+export const VOICES = Object.freeze({
+  af_heart: {
+    name: "Heart",
+    language: "en-us",
+    gender: "Female",
+    traits: "❤️",
+    targetQuality: "A",
+    overallGrade: "A",
+  },
+  af_alloy: {
+    name: "Alloy",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C",
+  },
+  af_aoede: {
+    name: "Aoede",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C+",
+  },
+  af_bella: {
+    name: "Bella",
+    language: "en-us",
+    gender: "Female",
+    traits: "🔥",
+    targetQuality: "A",
+    overallGrade: "A-",
+  },
+  af_jessica: {
+    name: "Jessica",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  af_kore: {
+    name: "Kore",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C+",
+  },
+  af_nicole: {
+    name: "Nicole",
+    language: "en-us",
+    gender: "Female",
+    traits: "🎧",
+    targetQuality: "B",
+    overallGrade: "B-",
+  },
+  af_nova: {
+    name: "Nova",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C",
+  },
+  af_river: {
+    name: "River",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  af_sarah: {
+    name: "Sarah",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C+",
+  },
+  af_sky: {
+    name: "Sky",
+    language: "en-us",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C-",
+  },
+  am_adam: {
+    name: "Adam",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "D",
+    overallGrade: "F+",
+  },
+  am_echo: {
+    name: "Echo",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  am_eric: {
+    name: "Eric",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  am_fenrir: {
+    name: "Fenrir",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "B",
+    overallGrade: "C+",
+  },
+  am_liam: {
+    name: "Liam",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  am_michael: {
+    name: "Michael",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "B",
+    overallGrade: "C+",
+  },
+  am_onyx: {
+    name: "Onyx",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  am_puck: {
+    name: "Puck",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "B",
+    overallGrade: "C+",
+  },
+  am_santa: {
+    name: "Santa",
+    language: "en-us",
+    gender: "Male",
+    targetQuality: "C",
+    overallGrade: "D-",
+  },
+  bf_emma: {
+    name: "Emma",
+    language: "en-gb",
+    gender: "Female",
+    traits: "🚺",
+    targetQuality: "B",
+    overallGrade: "B-",
+  },
+  bf_isabella: {
+    name: "Isabella",
+    language: "en-gb",
+    gender: "Female",
+    targetQuality: "B",
+    overallGrade: "C",
+  },
+  bm_george: {
+    name: "George",
+    language: "en-gb",
+    gender: "Male",
+    targetQuality: "B",
+    overallGrade: "C",
+  },
+  bm_lewis: {
+    name: "Lewis",
+    language: "en-gb",
+    gender: "Male",
+    targetQuality: "C",
+    overallGrade: "D+",
+  },
+  bf_alice: {
+    name: "Alice",
+    language: "en-gb",
+    gender: "Female",
+    traits: "🚺",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  bf_lily: {
+    name: "Lily",
+    language: "en-gb",
+    gender: "Female",
+    traits: "🚺",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  bm_daniel: {
+    name: "Daniel",
+    language: "en-gb",
+    gender: "Male",
+    traits: "🚹",
+    targetQuality: "C",
+    overallGrade: "D",
+  },
+  bm_fable: {
+    name: "Fable",
+    language: "en-gb",
+    gender: "Male",
+    traits: "🚹",
+    targetQuality: "B",
+    overallGrade: "C",
+  },
+
+  // TODO: Add support for other languages:
+  // The entries below are disabled placeholders, kept until those languages are supported.
+  // jf_alpha: {
+  //   name: "alpha",
+  //   language: "ja",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C+",
+  // },
+  // jf_gongitsune: {
+  //   name: "gongitsune",
+  //   language: "ja",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // jf_nezumi: {
+  //   name: "nezumi",
+  //   language: "ja",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C-",
+  // },
+  // jf_tebukuro: {
+  //   name: "tebukuro",
+  //   language: "ja",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // jm_kumo: {
+  //   name: "kumo",
+  //   language: "ja",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "B",
+  //   overallGrade: "C-",
+  // },
+  // zf_xiaobei: {
+  //   name: "xiaobei",
+  //   language: "zh",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zf_xiaoni: {
+  //   name: "xiaoni",
+  //   language: "zh",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zf_xiaoxiao: {
+  //   name: "xiaoxiao",
+  //   language: "zh",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zf_xiaoyi: {
+  //   name: "xiaoyi",
+  //   language: "zh",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zm_yunjian: {
+  //   name: "yunjian",
+  //   language: "zh",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zm_yunxi: {
+  //   name: "yunxi",
+  //   language: "zh",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zm_yunxia: {
+  //   name: "yunxia",
+  //   language: "zh",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // zm_yunyang: {
+  //   name: "yunyang",
+  //   language: "zh",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // ef_dora: {
+  //   name: "dora",
+  //   language: "es",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // em_alex: {
+  //   name: "alex",
+  //   language: "es",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // em_santa: {
+  //   name: "santa",
+  //   language: "es",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // NOTE(review): this "ff_" entry carries language "es", the same code as the
+  // "ef_"/"em_" entries above, which breaks the one-prefix-per-language pattern
+  // used everywhere else - confirm the intended language code before enabling.
+  // ff_siwis: {
+  //   name: "siwis",
+  //   language: "es",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "B-",
+  // },
+  // hf_alpha: {
+  //   name: "alpha",
+  //   language: "hi",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // hf_beta: {
+  //   name: "beta",
+  //   language: "hi",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // hm_omega: {
+  //   name: "omega",
+  //   language: "hi",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // hm_psi: {
+  //   name: "psi",
+  //   language: "hi",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // if_sara: {
+  //   name: "sara",
+  //   language: "it",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // im_nicola: {
+  //   name: "nicola",
+  //   language: "it",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "B",
+  //   overallGrade: "C",
+  // },
+  // pf_dora: {
+  //   name: "dora",
+  //   language: "pt-br",
+  //   gender: "Female",
+  //   traits: "🚺",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // pm_alex: {
+  //   name: "alex",
+  //   language: "pt-br",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+  // pm_santa: {
+  //   name: "santa",
+  //   language: "pt-br",
+  //   gender: "Male",
+  //   traits: "🚹",
+  //   targetQuality: "C",
+  //   overallGrade: "D",
+  // },
+});
+
+// Base URL of the hosted voice files; a voice lives at `${VOICE_DATA_URL}/${id}.bin`.
+const VOICE_DATA_URL = "https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/voices";
+
+/**
+ * Loads the raw binary data for a voice.
+ *
+ * When a filesystem API is available (`fs.readFile`), the file is read from the
+ * `../voices` directory relative to this module. Otherwise it is downloaded from
+ * `VOICE_DATA_URL`, using the "kokoro-voices" Cache Storage entry when it can be
+ * opened. Cache problems are logged and never prevent the download.
+ *
+ * @param {keyof typeof VOICES} id The voice id; also the base name of the `.bin` file.
+ * @returns {Promise<ArrayBufferLike>} The bytes of the voice file.
+ * @throws {Error} If the download responds with a non-success HTTP status.
+ */
+async function getVoiceFile(id) {
+  if (fs?.readFile) {
+    const dirname = typeof __dirname !== "undefined" ? __dirname : import.meta.dirname;
+    const file = path.resolve(dirname, `../voices/${id}.bin`);
+    const data = await fs.readFile(file);
+    // A Node.js Buffer can be a view into a larger (pooled) ArrayBuffer, so copy out
+    // exactly the bytes of this file instead of returning the whole backing store.
+    return data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength);
+  }
+
+  const url = `${VOICE_DATA_URL}/${id}.bin`;
+
+  let cache;
+  try {
+    cache = await caches.open("kokoro-voices");
+    const cachedResponse = await cache.match(url);
+    if (cachedResponse) {
+      return await cachedResponse.arrayBuffer();
+    }
+  } catch (e) {
+    // Best effort: the Cache API may be missing or blocked; fall through to the network.
+    console.warn("Unable to open cache", e);
+  }
+
+  // No cache, or cache failed to open. Fetch the file.
+  const response = await fetch(url);
+  if (!response.ok) {
+    // Fail loudly rather than caching and returning an error page as voice data.
+    throw new Error(`Failed to fetch voice "${id}" from ${url}: ${response.status} ${response.statusText}`);
+  }
+  const buffer = await response.arrayBuffer();
+
+  if (cache) {
+    try {
+      // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files
+      await cache.put(
+        url,
+        new Response(buffer, {
+          headers: response.headers,
+        }),
+      );
+    } catch (e) {
+      // Best effort: a failed cache write must not fail the load itself.
+      console.warn("Unable to cache file", e);
+    }
+  }
+
+  return buffer;
+}
+
+// Voice id -> promise of the decoded voice data. Storing the promise (not the
+// finished array) lets concurrent requests for one voice share a single load.
+const VOICE_CACHE = new Map();
+
+/**
+ * Returns the data of a voice as 32-bit floats, loading it at most once per voice.
+ *
+ * Calls made while a load is still in flight receive the same pending result
+ * instead of starting another read or download. A failed load is removed from
+ * the cache so that a later call can retry.
+ *
+ * @param {keyof typeof VOICES} voice The voice id.
+ * @returns {Promise<Float32Array>} The voice data; the same array instance is returned on every call.
+ */
+export async function getVoiceData(voice) {
+  let pending = VOICE_CACHE.get(voice);
+  if (pending === undefined) {
+    pending = (async () => new Float32Array(await getVoiceFile(voice)))();
+    VOICE_CACHE.set(voice, pending);
+    // Evict on failure; the rejection itself still reaches every caller awaiting `pending`.
+    pending.catch(() => {
+      if (VOICE_CACHE.get(voice) === pending) {
+        VOICE_CACHE.delete(voice);
+      }
+    });
+  }
+  return pending;
+}
--- a/kokoro.js/tests/phonemize.test.js
+++ b/kokoro.js/tests/phonemize.test.js
+import { describe, test, expect } from "vitest";
+import { phonemize } from "../src/phonemize.js";
+
+// Input text -> expected phoneme string for the "en-us" suite below, where
+// phonemize() is called without a language argument.
+// The values are compared with toEqual, so they must match exactly.
+const A_TEST_CASES = new Map([
+  ["‘Hello’", "həlˈoʊ"],
+  ["‘Test’ and ‘Example’", "tˈɛst ænd ɛɡzˈæmpəl"],
+  ["«Bonjour»", '"bɔːnʒˈʊɹ"'],
+  ["«Test «nested» quotes»", '"tˈɛst "nˈɛstᵻd" kwˈoʊts"'],
+  ["(Hello)", "«həlˈoʊ»"],
+  ["(Nested (Parentheses))", "«nˈɛstᵻd «pɚɹˈɛnθəsˌiːz»»"],
+  ["こんにちは、世界！", "dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ, tʃˈaɪniːzlˌɛɾɚ tʃˈaɪniːzlˌɛɾɚ!"],
+  ["これはテストです：はい？", "dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ: dʒˈæpəniːzlˌɛɾɚ dʒˈæpəniːzlˌɛɾɚ?"],
+  ["Hello World", "həlˈoʊ wˈɜːld"],
+  ["Hello   World", "həlˈoʊ wˈɜːld"],
+  ["Hello\n   \nWorld", "həlˈoʊ wˈɜːld"],
+  ["Dr. Smith", "dˈɑːktɚ smˈɪθ"],
+  ["DR. Brown", "dˈɑːktɚ bɹˈaʊn"],
+  ["Mr. Smith", "mˈɪstɚ smˈɪθ"],
+  ["MR. Anderson", "mˈɪstɚɹ ˈændɚsən"],
+  ["Ms. Taylor", "mˈɪs tˈeɪlɚ"],
+  ["MS. Carter", "mˈɪs kˈɑːɹɾɚ"],
+  ["Mrs. Johnson", "mˈɪsɪz dʒˈɑːnsən"],
+  ["MRS. Wilson", "mˈɪsɪz wˈɪlsən"],
+  ["Apples, oranges, etc.", "ˈæpəlz, ˈɔɹɪndʒᵻz, ɛtsˈɛtɹə"],
+  ["Apples, etc. Pears.", "ˈæpəlz, ɛtsˈɛtɹə. pˈɛɹz."],
+  ["Yeah", "jˈɛə"],
+  ["yeah", "jˈɛə"],
+  ["1990", "nˈaɪntiːn nˈaɪndi"],
+  ["12:34", "twˈɛlv θˈɜːɾi fˈoːɹ"],
+  ["2022s", "twˈɛnti twˈɛnti tˈuːz"],
+  ["1,000", "wˈʌn θˈaʊzənd"],
+  ["12,345,678", "twˈɛlv mˈɪliən θɹˈiː hˈʌndɹɪd fˈoːɹɾi fˈaɪv θˈaʊzənd sˈɪks hˈʌndɹɪd sˈɛvənti ˈeɪt"],
+  ["$100", "wˈʌn hˈʌndɹɪd dˈɑːlɚz"],
+  ["£1.50", "wˈʌn pˈaʊnd ænd fˈɪfti pˈɛns"],
+  ["12.34", "twˈɛlv pˈɔɪnt θɹˈiː fˈoːɹ"],
+  ["0.01", "zˈiəɹoʊ pˈɔɪnt zˈiəɹoʊ wˈʌn"],
+  ["10-20", "tˈɛn tə twˈɛnti"],
+  ["5-10", "fˈaɪv tə tˈɛn"],
+  ["10S", "tˈɛn ˈɛs"],
+  ["5S", "fˈaɪv ˈɛs"],
+  ["Cat's tail", "kˈæts tˈeɪl"],
+  ["X's mark", "ˈɛksᵻz mˈɑːɹk"],
+  ["U.S.A.", "jˈuːˈɛsˈeɪ."],
+  ["A.B.C", "ˈeɪbˈiːsˈiː"],
+]);
+
+// Input text -> expected phoneme string for the "en-gb" suite below, where
+// phonemize() is called with "b" as its second argument.
+// The values are compared with toEqual, so they must match exactly.
+const B_TEST_CASES = new Map([
+  ["‘Hello’", "həlˈəʊ"],
+  ["‘Test’ and ‘Example’", "tˈɛst and ɛɡzˈampəl"],
+  ["«Bonjour»", '"bɔːnʒˈʊə"'],
+  ["«Test «nested» quotes»", '"tˈɛst "nˈɛstɪd" kwˈəʊts"'],
+  ["(Hello)", "«həlˈəʊ»"],
+  ["(Nested (Parentheses))", "«nˈɛstɪd «pəɹˈɛnθəsˌiːz»»"],
+  ["こんにちは、世界！", "dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə, tʃˈaɪniːzlˌɛtə tʃˈaɪniːzlˌɛtə!"],
+  ["これはテストです：はい？", "dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə: dʒˈapəniːzlˌɛtə dʒˈapəniːzlˌɛtə?"],
+  ["Hello World", "həlˈəʊ wˈɜːld"],
+  ["Hello   World", "həlˈəʊ wˈɜːld"],
+  ["Hello\n   \nWorld", "həlˈəʊ wˈɜːld"],
+  ["Dr. Smith", "dˈɒktə smˈɪθ"],
+  ["DR. Brown", "dˈɒktə bɹˈaʊn"],
+  ["Mr. Smith", "mˈɪstə smˈɪθ"],
+  ["MR. Anderson", "mˈɪstəɹ ˈandəsən"],
+  ["Ms. Taylor", "mˈɪs tˈeɪlə"],
+  ["MS. Carter", "mˈɪs kˈɑːtə"],
+  ["Mrs. Johnson", "mˈɪsɪz dʒˈɒnsən"],
+  ["Apples, oranges, etc.", "ˈapəlz, ˈɒɹɪndʒɪz, ɛtsˈɛtɹə"],
+  ["Apples, etc. Pears.", "ˈapəlz, ɛtsˈɛtɹə. pˈeəz."],
+  ["1990", "nˈaɪntiːn nˈaɪnti"],
+  ["12:34", "twˈɛlv θˈɜːti fˈɔː"],
+  ["1,000", "wˈɒn θˈaʊzənd"],
+  ["12,345,678", "twˈɛlv mˈɪliən θɹˈiː hˈʌndɹɪdən fˈɔːti fˈaɪv θˈaʊzənd sˈɪks hˈʌndɹɪdən sˈɛvənti ˈeɪt"],
+  ["$100", "wˈɒn hˈʌndɹɪd dˈɒləz"],
+  ["£1.50", "wˈɒn pˈaʊnd and fˈɪfti pˈɛns"],
+  ["12.34", "twˈɛlv pˈɔɪnt θɹˈiː fˈɔː"],
+  ["0.01", "zˈiəɹəʊ pˈɔɪnt zˈiəɹəʊ wˈɒn"],
+  ["Cat's tail", "kˈats tˈeɪl"],
+  ["X's mark", "ˈɛksɪz mˈɑːk"],
+]);
+
+// One generated test per table row; each suite walks its own expectation map.
+describe("phonemize", () => {
+  // American English: phonemize() is called with the input only.
+  describe("en-us", () => {
+    A_TEST_CASES.forEach((expected, input) => {
+      test(`phonemize("${input}")`, async () => {
+        const actual = await phonemize(input);
+        expect(actual).toEqual(expected);
+      });
+    });
+  });
+  // British English: phonemize() additionally receives "b".
+  describe("en-gb", () => {
+    B_TEST_CASES.forEach((expected, input) => {
+      test(`phonemize("${input}")`, async () => {
+        const actual = await phonemize(input, "b");
+        expect(actual).toEqual(expected);
+      });
+    });
+  });
+});