In my last post, I tried Qwen3-TTS... This time, I test Chatterbox Turbo by Resemble.ai, which is an open-source, MIT licensed, text-to-speech model with zero-shot cloning.
Prior to this, I did not think of testing out Chatterbox Turbo because I thought it was a voice-cloning model that mandated a reference voice file. But it seems the reference file is optional; without it, a default female voice with an American English accent is generated. It is far faster and easier to use than Qwen3-TTS or Kokoro, but really shines with a reference voice.
Here is a bit of code to test it...
As usual, disclaimers apply - don’t use this code.
This requires a newer version of Python installed via Brew, with MLX-Audio and epub2text installed in a virtual environment, as described previously. Finally, it requires the MLX-format Chatterbox TTS model files to be downloaded into the models directory - I am using the fp16 version.
# chatter.py - Use Chatterbox Turbo TTS to create audio files from .epub chapters
# (c) C.Y., myByways.com
# v0.2 12 Feb 26
import os, sys, time
import mlx.core as mx
from epub2text import EPUBParser
os.environ['TQDM_DISABLE'] = '1'
from mlx_audio.tts.utils import load_model
from mlx_audio.audio_io import write as audio_write
from mlx_audio.tts.audio_player import AudioPlayer
# Directory holding the MLX-format Chatterbox Turbo model, passed to load_model()
CHATTERBOX_MODEL = 'models/chatterbox-turbo-fp16'
# Reference voice passed as ref_audio to model.generate(); None means no reference file
REFERENCE_AUDIO = None #'voices/c3po.wav'
# Extension appended to each chapter's audio file name handed to audio_write()
OUTPUT_FORMAT = '.flac'
# When True, save_epub_tts() also writes each chapter's text to a .txt file
SAVE_CHAPTER = False
#https://stackoverflow.com/questions/6405208/how-to-convert-numeric-string-ranges-to-a-list-in-python
def stringrange_to_list(text):
    """Expand a comma-separated list of numbers and ranges into a list of ints.

    Each comma-separated item is either a single integer ("3") or an
    inclusive range written "first-last" ("4-6" gives 4, 5, 6). Whitespace
    around items is ignored and empty items (for example from a trailing
    comma) are skipped. A range whose last value is smaller than its first
    expands to nothing.

    Args:
        text: Range specification such as "1,4-6".

    Returns:
        The numbers in the order given, e.g. [1, 4, 5, 6].

    Raises:
        ValueError: If an item is neither an integer nor a "first-last" pair.
    """
    numbers = []
    for item in text.split(','):
        item = item.strip()
        if not item:
            # Tolerate "1,2," or an empty specification instead of crashing.
            continue
        if '-' in item:
            # Split on the first dash only; anything left over that is not a
            # plain integer (e.g. "1-5-2") is rejected by int() below.
            first, _, last = item.partition('-')
            numbers.extend(range(int(first), int(last) + 1))
        else:
            numbers.append(int(item))
    return numbers
def start_timer():
    """Return the current time.perf_counter() reading, to be given to stop_timer()."""
    started = time.perf_counter()
    return started
def stop_timer(tic):
    """Measure the time elapsed since an earlier time.perf_counter() reading.

    Args:
        tic: Start value, as returned by start_timer().

    Returns:
        A tuple (elapsed, minutes, seconds): the elapsed time in fractional
        seconds, followed by the same duration split into whole minutes and
        whole remaining seconds.
    """
    elapsed = time.perf_counter() - tic
    minutes, seconds = divmod(elapsed, 60)
    return elapsed, int(minutes), int(seconds)
def list_epub_chapters(epub_file):
    """Print the file name, title, author(s) and numbered chapters of an .epub.

    Args:
        epub_file: Path to the .epub file, handed to EPUBParser.
    """
    book = EPUBParser(epub_file)
    info = book.get_metadata()
    authors = ", ".join(info.authors)
    # Header: one line each for the file, its title and its authors.
    print(f'Book: {epub_file}\nTitle: {info.title}\nAuthor(s): {authors}')
    # Chapters are listed with 1-based numbers and their character counts.
    for number, chapter in enumerate(book.get_chapters(), start=1):
        print(f' {number:>3}. {chapter.title}: {chapter.char_count:,} characters')
def play_tts(text):
    """Generate speech for the given text and play it while it is produced.

    Loads the model at CHATTERBOX_MODEL, generates in streaming mode (with
    REFERENCE_AUDIO passed as the reference voice) and queues the audio of
    every generated result on an AudioPlayer.

    Args:
        text: The text to synthesise.
    """
    tts = load_model(CHATTERBOX_MODEL)
    segments = tts.generate(text = text, ref_audio = REFERENCE_AUDIO, stream = True)
    speaker = AudioPlayer(sample_rate = tts.sample_rate)
    for segment in segments:
        speaker.queue_audio(segment.audio)
    # Let the player drain what was queued before it is stopped.
    speaker.wait_for_drain()
    speaker.stop()
def save_epub_tts(epub_file, chapter_range):
    """Convert selected chapters of an .epub into one audio file per chapter.

    Output files are named by zero-padded chapter number plus OUTPUT_FORMAT
    (e.g. 004.flac) and are written to a directory named after the .epub file
    (without its extension), created relative to the current directory. When
    SAVE_CHAPTER is true the chapter text is saved next to it as a .txt file.

    Chapter numbers outside 1..number-of-chapters are reported and skipped:
    without this check 0 would silently select the last chapter (index -1)
    and a too-large number would raise IndexError part-way through a run.

    Args:
        epub_file: Path to the .epub file.
        chapter_range: Chapter selection such as "1,4-6", expanded by
            stringrange_to_list().
    """
    epub = EPUBParser(epub_file)
    chapters = epub.get_chapters()
    chapter_total = len(chapters)
    # Parse the selection before creating anything, so a malformed range
    # fails without leaving an empty output directory behind.
    chapter_numbers = stringrange_to_list(chapter_range)
    output_dir = os.path.splitext(os.path.basename(epub_file))[0]
    os.makedirs(output_dir, exist_ok = True)
    model = load_model(CHATTERBOX_MODEL)
    for c in chapter_numbers:
        if not 1 <= c <= chapter_total:
            print(f'Skip {c}: not a chapter number between 1 and {chapter_total}')
            continue
        chapter = chapters[c-1]
        print(f'Extract {c}. {chapter.title}: {chapter.char_count:,} characters')
        if SAVE_CHAPTER:
            # Explicit encoding so non-ASCII book text does not depend on the locale.
            with open(os.path.join(output_dir, f'{c:03}.txt'), 'w', encoding='utf-8') as file:
                file.write(chapter.text)
        tic = start_timer()
        results = model.generate(text = chapter.text, ref_audio = REFERENCE_AUDIO, temperature=0.5, repetition_penalty=1.5)
        audio = [result.audio for result in results]
        if not audio:
            # Nothing to concatenate (e.g. a chapter without text).
            print(f' Skip {c:03}{OUTPUT_FORMAT}: no audio was generated')
            continue
        audio_write(os.path.join(output_dir, f'{c:03}{OUTPUT_FORMAT}'),
                    mx.concatenate(audio, axis=0), model.sample_rate)
        elapsed, minutes, seconds = stop_timer(tic)
        # cpm = characters of chapter text processed per minute of wall time.
        print(f' Completed {c:03}{OUTPUT_FORMAT} in '
              f'{minutes:,} minutes {seconds} seconds ({int(60*chapter.char_count/elapsed):,} cpm)')
match len(sys.argv):
case 2:
epub_file = sys.argv[1]
if os.path.isfile(epub_file):
if epub_file.lower().endswith('.epub'):
list_epub_chapters(epub_file)
else:
with open(epub_file, 'r') as file:
play_tts(file.read())
else:
play_tts(epub_file)
case 3:
epub_file = sys.argv[1]
chapter_range = sys.argv[2]
if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):
tic = start_timer()
save_epub_tts(epub_file, chapter_range)
tic, min, sec = stop_timer(tic)
print(f'Completed in {min:,} minutes {sec} seconds')
case _:
cmd = sys.argv[0]
print(f'''{cmd} text|file - play TTS stream on given text or text in file
{cmd} epub_file - list .epub chapters (file must exist)
{cmd} epub_file chapter_range - perform TTS on given chapters in .epub''')
The code above is a bit cleaner than the previous version:
- running `python chatter.py "testing 1-2-3"` will play the given string (or contents of a text file).
- running `python chatter.py book.epub` will list the chapters of an `.epub` book.
- running `python chatter.py book.epub 1,4-6` will save chapters 1, 4, 5 and 6 as `.flac` files in a new directory following the `.epub` filename.
Note the code does not assume the presence of a reference voice file.
The easiest way to test with a reference voice is to use one of the sample voice files at Resemble.ai’s Github demo page - right-click the audio control under any Prompt (or even an Output), and select “Save Audio as...” or “Download Audio as...” (Firefox and Safari respectively) to save it as a .wav file.
Chuck it in a voices directory. Then edit the global variable in the code above, such that REFERENCE_AUDIO points to the saved file, e.g.
REFERENCE_AUDIO = 'voices/c3po.wav'
Update 15 Feb 26: Updated code... to go with part 2, generating a .m4b audiobook with metadata and chapter markers.