# phoneme_example.py
# (Repository web-view residue removed: commit "init" by guobj; line-number gutter 1-62.)
from kokoro import KPipeline, KModel
import torch
from scipy.io import wavfile

def save_audio(audio: torch.Tensor, filename: str, sample_rate: int = 24000):
    """Save an audio tensor as a WAV file.

    Args:
        audio: Samples to write. ``None`` is tolerated: a notice is printed
            and nothing is written.
        filename: Destination path handed to ``scipy.io.wavfile.write``.
        sample_rate: Sample rate in Hz stored in the WAV header. Defaults to
            24000 because Kokoro uses a 24kHz sample rate.

    Returns:
        None. A status line is printed for both outcomes.
    """
    if audio is None:
        print("No audio was generated")
        return

    # detach() drops autograd tracking so numpy() cannot fail on a tensor
    # that requires grad; cpu() ensures the data is in host memory.
    samples = audio.detach().cpu().numpy()

    # Save using scipy.io.wavfile
    wavfile.write(filename, sample_rate, samples)
    print(f"Audio saved as '{filename}'")

def main():
    """Run two Kokoro phoneme-synthesis examples and save the audio as WAV.

    Example 1 feeds a raw phoneme string to ``generate_from_tokens`` and
    saves the first result as ``phoneme_output_new.wav``.
    Example 2 converts text with ``pipeline.g2p`` first, feeds the resulting
    tokens to ``generate_from_tokens``, prints per-token timestamps when the
    tokens carry them, and saves each result as ``token_output_<index>.wav``.

    Any exception raised by either example is caught and reported on stdout.
    """
    voice = "af_bella"
    speed = 1.0

    # Initialize pipeline with American English
    pipeline = KPipeline(lang_code='a')

    # The phoneme string for:
    # "How are you today? I am doing reasonably well, thank you for asking"
    phonemes = "hˌW ɑɹ ju tədˈA? ˌI ɐm dˈuɪŋ ɹˈizənəbli wˈɛl, θˈæŋk ju fɔɹ ˈæskɪŋ"

    try:
        print("\nExample 1: Using generate_from_tokens with raw phonemes")
        results = list(pipeline.generate_from_tokens(
            tokens=phonemes,
            voice=voice,
            speed=speed,
        ))
        if results:
            save_audio(results[0].audio, 'phoneme_output_new.wav')

        # Example 2: Using generate_from_tokens with pre-processed tokens
        print("\nExample 2: Using generate_from_tokens with pre-processed tokens")
        # Get the tokens through G2P or any other method
        text = "How are you today? I am doing reasonably well, thank you for asking"
        _, tokens = pipeline.g2p(text)

        # Then generate from tokens
        generated = pipeline.generate_from_tokens(
            tokens=tokens,
            voice=voice,
            speed=speed,
        )
        for index, result in enumerate(generated):
            # Each result may contain timestamps if available
            if result.tokens:
                for token in result.tokens:
                    if hasattr(token, 'start_ts') and hasattr(token, 'end_ts'):
                        print(f"Token: {token.text} ({token.start_ts:.2f}s - {token.end_ts:.2f}s)")
            # Name files by position rather than hash(result.phonemes): str
            # hashes are salted per process, so hash-based names changed on
            # every run, could be negative, and collided for equal phonemes.
            save_audio(result.audio, f'token_output_{index}.wav')

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()