abc.json 15.5 KB
Newer Older
wangkx1's avatar
wangkx1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
{
    "AigcAttributes": "{}",
    "AigcIsTop": 0,
    "AigcType": "",
    "AlreadyStar": False,
    "ApplyMeta": "{}",
    "ApprovalMode": 1,
    "ApprovalNotifyEmail": "",
    "Architectures": [],
    "Avatar": "https://img.alicdn.com/imgextra/i1/O1CN01yhHrHg1Pdl3UKPhGc_!!6000000001864-2-tps-88-88.png",
    "Backbone": [],
    "BackendSupport": {
        "architectures": None,
        "backend_info": {
            "deploy_task": None,
            "lmdeploy": None,
            "lmdeploy_turbomind": None,
            "ollama": None,
            "sglang": None,
            "vllm": None
        },
        "model_id": "OpenBMB/VoxCPM2"
    },
    "BaseModel": [],
    "BaseModelRelation": "",
    "CardReady": 0,
    "CardUnreadyReason": "",
    "CertificationCreateBy": "",
    "CertificationCreatedTime": -62135596800,
    "ChineseName": "VoxCPM2",
    "CoverImages": [],
    "CreatedBy": "LabmemZhouyx",
    "CreatedTime": 1775194576,
    "DashSdkParameter": "",
    "Datasets": {},
    "DemoAvailable": 0,
    "DemoUnavailableReason": "",
    "Description": "",
    "Domain": [],
    "Downloads": 5067,
    "ExampleCodeAvailable": 0,
    "ExampleCodeUnavailableReason": "",
    "ForbiddenVisibilityUpdate": False,
    "Frameworks": [],
    "FromSite": "maas",
    "Id": 1125292,
    "Integrating": 0,
    "IntegrationFailureLog": "",
    "IntegrationFailureReason": "",
    "IsAccessible": 1,
    "IsCertification": 4,
    "IsHot": 0,
    "IsNewModel": True,
    "IsOnline": 1,
    "IsPreTrain": 0,
    "IsPublished": 1,
    "IsTop": 41,
    "Language": [
        "zh",
        "en",
        "ar",
        "my",
        "da",
        "nl",
        "fi",
        "fr",
        "de",
        "el",
        "he",
        "hi",
        "id",
        "it",
        "ja",
        "km",
        "ko",
        "lo",
        "ms",
        "no",
        "pl",
        "pt",
        "ru",
        "es",
        "sw",
        "sv",
        "tl",
        "th",
        "tr",
        "vi"
    ],
    "LastUpdatedTime": 1775645023,
    "Libraries": [
        "safetensors"
    ],
    "License": "apache-2.0",
    "Meta": "",
    "Metrics": [],
    "ModelDetail": {},
    "ModelInfos": {
        "safetensor": {
            "chat_template": "{% for message in messages %}{{"<|im_start|>" + message["role"] + "\n" + message["content"] + "<|im_end|>" + "\n"}}{% endfor %}{% if add_generation_prompt %}{{ "<|im_start|>assistant\n" }}{% endif %}",
            "files": [
                {
                    "name": "model.safetensors",
                    "sha256": "f7f964cfa9da23653baec6e6f7750719977ad944ed9f95fe52fe3a620506891d",
                    "size": 4580080592
                }
            ],
            "model_size": 2290004544,
            "tensor_type": [
                "BF16"
            ]
        }
    },
    "ModelRevisions": None,
    "ModelSource": "USER_UPLOAD",
    "ModelTools": "",
    "ModelType": [],
    "MuseInfo": None,
    "NEXA": {
        "Catalogues": None,
        "ModelCover": "",
        "ScientificField": "",
        "Source": "",
        "SubScientificField": None
    },
    "Name": "VoxCPM2",
    "NewVersion": "",
    "NickName": "",
    "OfficialTags": None,
    "OpenAiSwingDeployInfo": {
        "Order": 0,
        "Recommend": None,
        "lmdeploy": {
            "eas": {
                "Script": "",
                "requirements": ""
            },
            "ens": {
                "Script": "",
                "requirements": ""
            },
            "fc": {
                "Script": "",
                "requirements": ""
            },
            "image_tag": ""
        },
        "ollama": {
            "eas": {
                "Script": "",
                "requirements": ""
            },
            "ens": {
                "Script": "",
                "requirements": ""
            },
            "fc": {
                "Script": "",
                "requirements": ""
            },
            "image_tag": ""
        },
        "pipeline": {
            "eas": {
                "Script": "",
                "requirements": ""
            },
            "ens": {
                "Script": "",
                "requirements": ""
            },
            "fc": {
                "Script": "",
                "requirements": ""
            },
            "image_tag": ""
        },
        "vllm": {
            "eas": {
                "Script": "",
                "requirements": ""
            },
            "ens": {
                "Script": "",
                "requirements": ""
            },
            "fc": {
                "Script": "",
                "requirements": ""
            },
            "image_tag": ""
        }
    },
    "Organization": {
        "ApplyFailureReason": "",
        "ApplyReason": "",
        "Avatar": "https://resouces.modelscope.cn/avatar/e23b1834-049d-464e-8ffc-4b10093114d0.png",
        "CreateCompetition": False,
        "CreatedBy": "hicicada",
        "Description": "["root",{},["p",{},["span",{"data-type":"text"},["span",{"color":"rgb(101,
        109,
        118)","data-type":"leaf"},"OpenBMB (Open Lab for Big Model Base) aims to build foundation models and systems towards AGI."]]]]",
        "DisplayUrl": "",
        "Email": "",
        "FromSite": "",
        "FullName": "OpenBMB",
        "GithubAddress": "https://github.com/OpenBMB",
        "GmtCreated": "2023-03-21T07:55:04Z",
        "GmtModified": "2025-01-07T09:42:14Z",
        "Id": 63,
        "InitAdminMembers": "",
        "IsApply": False,
        "IsCertification": "",
        "Mobile": "",
        "Name": "OpenBMB",
        "Path": "",
        "Roles": None,
        "StarCnt": 0,
        "Status": 0,
        "SubscribeVo": None,
        "Type": 2
    },
    "PaiModelGalleryUrl": None,
    "PaiSdkParameter": None,
    "Path": "OpenBMB",
    "ProtectedMode": 2,
    "ReadMeContent": "\n# VoxCPM2\n\n**VoxCPM2** is a tokenizer-free, diffusion autoregressive Text-to-Speech model — **2B parameters**, **30 languages**, **48kHz** audio output, trained on over **2 million hours** of multilingual speech data.\n\n[![GitHub](https://img.shields.io/badge/GitHub-VoxCPM-blue?logo=github)](https://github.com/OpenBMB/VoxCPM)\n[![Docs](https://img.shields.io/badge/Docs-ReadTheDocs-8CA1AF)](https://voxcpm.readthedocs.io/en/latest/)\n[![Demo](https://img.shields.io/badge/Live%20Playground-Demo-orange)](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo)\n[![Audio Samples](https://img.shields.io/badge/Audio%20Samples-Demo%20Page-green)](https://openbmb.github.io/voxcpm2-demopage)\n[![Discord](https://img.shields.io/badge/Discord-VoxCPM-5865F2?logo=discord&logoColor=white)](https://discord.gg/KZUx7tVNwz)\n\n## Highlights\n\n- 🌍 **30-Language Multilingual** — No language tag needed; input text in any supported language directly\n- 🎨 **Voice Design** — Generate a novel voice from a natural-language description alone (gender, age, tone, emotion, pace…); no reference audio required\n- 🎛️ **Controllable Cloning** — Clone any voice from a short clip, with optional style guidance to steer emotion, pace, and expression while preserving timbre\n- 🎙️ **Ultimate Cloning** — Provide reference audio + its transcript for audio-continuation cloning; every vocal nuance faithfully reproduced\n- 🔊 **48kHz Studio-Quality Output** — Accepts 16kHz reference; outputs 48kHz via AudioVAE V2\"s built-in super-resolution, no external upsampler needed\n- 🧠 **Context-Aware Synthesis** — Automatically infers appropriate prosody and expressiveness from text content\n- ⚡ **Real-Time Streaming** — RTF as low as ~0.3 on NVIDIA RTX 4090, and ~0.13  accelerated by [Nano-VLLM](https://github.com/a710128/nanovllm-voxcpm)\n- 📜 **Fully Open-Source & Commercial-Ready** — Apache-2.0 license, free for commercial use\n\n国内用户欢迎访问官网体验:https://voxcpm.modelbest.cn/\n\n<summary><b>Supported Languages (30)</b></summary>\n\nArabic, Burmese, Chinese, Danish, Dutch, English, Finnish, French, German, Greek, Hebrew, Hindi, Indonesian, Italian, Japanese, Khmer, Korean, Lao, Malay, Norwegian, Polish, Portuguese, Russian, Spanish, Swahili, Swedish, Tagalog, Thai, Turkish, Vietnamese\n\nChinese Dialects: 四川话, 粤语, 吴语, 东北话, 河南话, 陕西话, 山东话, 天津话, 闽南话\n\n\n## Quick Start\n\n### Installation\n\n```bash\npip install voxcpm\n```\n\n**Requirements:** Python ≥ 3.10, PyTorch ≥ 2.5.0, CUDA ≥ 12.0 · [Full Quick Start →](https://voxcpm.readthedocs.io/en/latest/quickstart.html)\n\n### Text-to-Speech\n\n```python\nfrom voxcpm import VoxCPM\nimport soundfile as sf\n\nmodel = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False)\n\nwav = model.generate(\n    text="VoxCPM2 brings multilingual support, creative voice design, and controllable voice cloning.",\n    cfg_value=2.0,\n    inference_timesteps=10,\n)\nsf.write("output.wav", wav, model.tts_model.sample_rate)\n```\n\nIf you prefer downloading from ModelScope first, you can use:\n\n```bash\npip install modelscope\n```\n\n```python\nfrom modelscope import snapshot_download\nsnapshot_download("OpenBMB/VoxCPM2", local_dir=\"./pretrained_models/VoxCPM2\") # specify the local directory to save the model\n\nfrom voxcpm import VoxCPM\nimport soundfile as sf\nmodel = VoxCPM.from_pretrained("./pretrained_models/VoxCPM2", load_denoiser=False)\n\nwav = model.generate(\n    text="VoxCPM2 is the current recommended release for realistic multilingual speech synthesis.",\n    cfg_value=2.0,\n    inference_timesteps=10,\n)\nsf.write("demo.wav", wav, model.tts_model.sample_rate)\n```\n\n### Voice Design\n\nPut the voice description in parentheses at the start of `text`, followed by the content to synthesize:\n\n```python\nwav = model.generate(\n    text="(A young woman, gentle and sweet voice)Hello, welcome to VoxCPM2!",\n    cfg_value=2.0,\n    inference_timesteps=10,\n)\nsf.write("voice_design.wav", wav, model.tts_model.sample_rate)\n```\n\n### Controllable Voice Cloning\n\n```python\n# Basic cloning\nwav = model.generate(\n    text="This is a cloned voice generated by VoxCPM2.",\n    reference_wav_path="speaker.wav",\n)\nsf.write("clone.wav", wav, model.tts_model.sample_rate)\n\n# Cloning with style control\nwav = model.generate(\n    text="(slightly faster, cheerful tone)This is a cloned voice with style control.",\n    reference_wav_path="speaker.wav",\n    cfg_value=2.0,\n    inference_timesteps=10,\n)\nsf.write("controllable_clone.wav", wav, model.tts_model.sample_rate)\n```\n\n### Ultimate Cloning\n\nProvide both the reference audio and its exact transcript for maximum fidelity. Pass the same clip to both `reference_wav_path` and `prompt_wav_path` for highest similarity:\n\n```python\nwav = model.generate(\n    text="This is an ultimate cloning demonstration using VoxCPM2.",\n    prompt_wav_path="speaker_reference.wav",\n    prompt_text="The transcript of the reference audio.",\n    reference_wav_path="speaker_reference.wav",\n)\nsf.write("hifi_clone.wav", wav, model.tts_model.sample_rate)\n```\n\n### Streaming\n\n```python\nimport numpy as np\n\nchunks = []\nfor chunk in model.generate_streaming(text="Streaming is easy with VoxCPM!"):\n    chunks.append(chunk)\nwav = np.concatenate(chunks)\nsf.write("streaming.wav", wav, model.tts_model.sample_rate)\n```\n\n## Model Details\n\n| Property | Value |\n|---|---|\n| Architecture | Tokenizer-free Diffusion Autoregressive (LocEnc → TSLM → RALM → LocDiT) |\n| Backbone | Based on MiniCPM-4, totally 2B parameters |\n| Audio VAE | AudioVAE V2 (asymmetric encode/decode, 16kHz in → 48kHz out) |\n| Training Data | 2M+ hours multilingual speech |\n| LM Token Rate | 6.25 Hz |\n| Max Sequence Length | 8192 tokens |\n| dtype | bfloat16 |\n| VRAM | ~8 GB |\n| RTF (RTX 4090) | ~0.30 (standard) / ~0.13 (Nano-vLLM) |\n\n## Performance\n\nVoxCPM2 achieves state-of-the-art or competitive results on major zero-shot and controllable TTS benchmarks.\n\nSee the [GitHub repo](https://github.com/OpenBMB/VoxCPM#-performance) for full benchmark tables (Seed-TTS-eval, CV3-eval, InstructTTSEval, MiniMax Multilingual Test).\n\n## Fine-tuning\n\nVoxCPM2 supports both full SFT and LoRA fine-tuning with as little as 5–10 minutes of audio:\n\n```bash\n# LoRA fine-tuning (recommended)\npython scripts/train_voxcpm_finetune.py \\\n    --config_path conf/voxcpm_v2/voxcpm_finetune_lora.yaml\n\n# Full fine-tuning\npython scripts/train_voxcpm_finetune.py \\\n    --config_path conf/voxcpm_v2/voxcpm_finetune_all.yaml\n```\n\nSee the [Fine-tuning Guide](https://voxcpm.readthedocs.io/en/latest/finetuning/finetune.html) for full instructions.\n\n## Limitations\n\n- Voice Design and Style Control results may vary between runs; generating 1–3 times is recommended to obtain the desired output.\n- Performance varies across languages depending on training data availability.\n- Occasional instability may occur with very long or highly expressive inputs.\n- **Strictly forbidden** to use for impersonation, fraud, or disinformation. AI-generated content should be clearly labeled.\n\n## Citation\n\n```bibtex\n@article{voxcpm2_2026,\n  title   = {VoxCPM2: Tokenizer-Free TTS for Multilingual Speech Generation, Creative Voice Design, and True-to-Life Cloning},\n  author  = {VoxCPM Team},\n  journal = {GitHub},\n  year    = {2026},\n}\n\n@article{voxcpm2025,\n  title   = {VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning},\n  author  = {Zhou, Yixuan and Zeng, Guoyang and Liu, Xin and Li, Xiang and\n             Yu, Renjie and Wang, Ziyang and Ye, Runchuan and Sun, Weiyue and\n             Gui, Jiancheng and Li, Kehan and Wu, Zhiyong and Liu, Zhiyuan},\n  journal = {arXiv preprint arXiv:2509.24650},\n  year    = {2025},\n}\n```\n\n## License\n\nReleased under the [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) license, free for commercial use. For production deployments, we recommend thorough testing and safety evaluation tailored to your use case.\n\n",
    "ReadMeTips": None,
    "RelatedArxivId": [
        "2509.24650"
    ],
    "RelatedPaper": [
        192183
    ],
    "Revision": "master",
    "Stars": 56,
    "StorageSize": 4960729800,
    "Studios": [],
    "SubVisionFoundation": "",
    "SupportApiInference": False,
    "SupportDashDeployment": 0,
    "SupportDashInference": 0,
    "SupportDashTraining": 0,
    "SupportDeployment": 0,
    "SupportExperience": 0,
    "SupportFinetuning": 0,
    "SupportFlexTrain": 0,
    "SupportInference": "",
    "SupportPaiModelGallery": None,
    "SupportPaiSdk": 0,
    "SwingDeployInfo": None,
    "Tags": [
        "text-to-speech",
        "tts",
        "multilingual",
        "voice-cloning",
        "voice-design",
        "diffusion",
        "audio"
    ],
    "Tasks": [
        {
            "ChineseName": "语音合成",
            "Description": "",
            "DomainName": "audio",
            "Id": 32,
            "IsExhibition": True,
            "IsHot": 0,
            "IsLeaf": True,
            "IsLoginRequired": True,
            "IsRetrieval": True,
            "Level": 1,
            "Name": "text-to-speech",
            "ParentId": -1,
            "ParentTask": None,
            "Sorting": 0,
            "SupportWidgets": True,
            "TypicalModel": "",
            "WidgetConfig": "{"task": "text-to-speech", "inputs": [{"type": "text", "validator": {"max_words": 300}, "displayType": "OnlyTextArea"}], "output": {"displayType": "AudioPlayer", "transformOutputs": [{"fileType": "pcm", "outputKey": "output_pcm"}], "displayOutputMapping": "output_pcm"}, "examples": []}",
            "WidgetValidator": ""
        }
    ],
    "Tools": [],
    "TriggerWords": None,
    "Visibility": 5,
    "VisionFoundation": "",
    "_": None,
    "widgets": []
}