"""Benchmark first-output latency of CosyVoice2 zero-shot synthesis.

Loads the CosyVoice2 model once, then repeatedly synthesizes a short
utterance with a zero-shot speaker prompt. For every run it prints the
elapsed time (in milliseconds) until the inference generator yields its
first item, saves every yielded waveform to a ``.wav`` file in the current
directory, and finally prints the average over all runs that produced
output.
"""
import sys
import time

import torchaudio

# Extended before the cosyvoice.cli import, as in the original statement
# order; presumably that import needs the vendored Matcha-TTS -- TODO confirm.
sys.path.append('third_party/Matcha-TTS')

from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2  # noqa: E402,F401
from cosyvoice.utils.file_utils import load_wav  # noqa: E402

# Number of timed synthesis runs.
NUM_RUNS = 10

# Value passed as ``stream=`` to ``inference_zero_shot``.
# NOTE(review): with STREAM = False the generator presumably yields only
# after the whole utterance is synthesized, in which case the reported
# "TTFT" is really full-utterance latency; set STREAM = True to time the
# first streamed chunk -- confirm against the CosyVoice documentation.
STREAM = False

cosyvoice = CosyVoice2(
    '/mnt/CosyVoice2-0.5B',
    load_jit=False,
    load_trt=False,
    load_vllm=False,
    fp16=False,
)

prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
text = '你好。'
spk_text = '希望你以后'

# One entry per run that yielded at least one item, in milliseconds.
ttft_list = []

for run in range(NUM_RUNS):
    # perf_counter is monotonic, so the interval cannot be distorted by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    outputs = cosyvoice.inference_zero_shot(
        text, spk_text, prompt_speech_16k, stream=STREAM
    )
    for chunk_index, chunk in enumerate(outputs):
        if chunk_index == 0:
            # Take the timestamp before saving so that file I/O is not
            # included in the latency figure.
            ttft = (time.perf_counter() - start_time) * 1000
            print(f"Run {run+1} TTFT: {ttft:.2f} ms")
            ttft_list.append(ttft)

        torchaudio.save(
            f'zero_shot_run{run+1}_{chunk_index}.wav',
            chunk['tts_speech'],
            cosyvoice.sample_rate,
        )

if ttft_list:
    average_ttft = sum(ttft_list) / len(ttft_list)
    # Report the number of samples actually collected rather than assuming
    # that every run yielded output.
    print(f"\nAverage TTFT over {len(ttft_list)} runs: {average_ttft:.2f} ms")
else:
    # Avoid ZeroDivisionError when no run yielded any output.
    print("\nNo output was produced; average TTFT is undefined.")