Chatterbox Turbo TTS with MLX-Audio

In my last post, I tried Qwen3-TTS... This time, I am testing Chatterbox Turbo by Resemble.ai, which is an open-source, MIT-licensed text-to-speech model with zero-shot voice cloning.

Prior to this, I had not thought of testing Chatterbox Turbo because I thought it was a voice cloning model that mandated a reference voice file. But it seems the reference file is optional; without one, a default female voice with an American English accent is generated. It is far faster and easier to use than Qwen3-TTS or Kokoro, but really shines with a reference voice.

Here is a bit of code to test it...

As usual, disclaimers apply - don’t use this code.

Requires a recent version of Python (3.10 or later, because the script uses match statements) installed via Brew, with MLX-Audio and epub2text installed in a virtual environment, as described previously. Finally, the MLX-format Chatterbox TTS model files must be downloaded into the models directory - I am using the fp16 version.

# chatter.py - Use Chatterbox Turbo TTS to create audio files from .epub chapters
# (c) C.Y., myByways.com
#  v0.2 12 Feb 26

import os, sys, time
import mlx.core as mx
from epub2text import EPUBParser
os.environ['TQDM_DISABLE'] = '1'
from mlx_audio.tts.utils import load_model
from mlx_audio.audio_io import write as audio_write
from mlx_audio.tts.audio_player import AudioPlayer

CHATTERBOX_MODEL = 'models/chatterbox-turbo-fp16'
REFERENCE_AUDIO = None #'voices/c3po.wav'
OUTPUT_FORMAT = '.flac'
SAVE_CHAPTER = False

#https://stackoverflow.com/questions/6405208/how-to-convert-numeric-string-ranges-to-a-list-in-python
def stringrange_to_list(text):
    """Expand a comma-separated list of numbers and ranges into a list of ints.

    For example, ``'1,4-6'`` becomes ``[1, 4, 5, 6]``. Ranges are inclusive
    at both ends; a descending range such as ``'5-3'`` expands to nothing.
    Whitespace around tokens is ignored, and empty tokens (for instance the
    one produced by a trailing comma) are skipped.

    Args:
        text: the selection string, e.g. ``'1,4-6'``.

    Returns:
        The selected numbers, in the order they appear in ``text``.

    Raises:
        ValueError: if a token is neither an integer nor a ``start-end`` pair.
    """
    numbers = []
    for token in text.split(','):
        token = token.strip()
        if not token:
            # Tolerate '1,2,' and '' rather than failing on int('').
            continue
        if '-' in token:
            bounds = token.split('-')
            if len(bounds) != 2:
                # 'a-b-c' would otherwise be read as range(start, stop, step).
                raise ValueError(f'invalid range: {token!r}')
            first, last = (int(bound) for bound in bounds)
            numbers.extend(range(first, last + 1))
        else:
            numbers.append(int(token))
    return numbers

def start_timer():
    """Return the current ``time.perf_counter()`` reading, to be handed to ``stop_timer`` later."""
    started_at = time.perf_counter()
    return started_at

def stop_timer(tic):
    """Measure the time elapsed since ``tic``.

    Args:
        tic: an earlier ``time.perf_counter()`` reading.

    Returns:
        A tuple ``(elapsed, minutes, seconds)``: the elapsed time in seconds
        as a float, followed by the same duration split into whole minutes
        and the remaining whole seconds.
    """
    elapsed = time.perf_counter() - tic
    whole_minutes = int(elapsed // 60)
    leftover_seconds = int(elapsed % 60)
    return elapsed, whole_minutes, leftover_seconds

def list_epub_chapters(epub_file):
    """Print an .epub's title and author(s), followed by a numbered chapter list.

    Chapters are numbered from 1 in the order ``get_chapters()`` yields them;
    each line shows the chapter title and its character count.
    """
    book = EPUBParser(epub_file)
    info = book.get_metadata()
    authors = ", ".join(info.authors)
    print(f'''Book: {epub_file}
Title: {info.title}
Author(s): {authors}''')
    for number, chapter in enumerate(book.get_chapters(), start=1):
        print(f' {number:>3}. {chapter.title}: {chapter.char_count:,} characters')

def play_tts(text):
    """Synthesize ``text`` with the Chatterbox model and play it through an ``AudioPlayer``.

    The model is loaded from ``CHATTERBOX_MODEL`` on every call, and
    ``REFERENCE_AUDIO`` is passed through as the reference voice.
    """
    tts = load_model(CHATTERBOX_MODEL)
    stream = tts.generate(text = text, ref_audio = REFERENCE_AUDIO, stream = True)
    speaker = AudioPlayer(sample_rate = tts.sample_rate)
    # Hand each generated result to the player as soon as it is produced.
    for chunk in stream:
        speaker.queue_audio(chunk.audio)
    # NOTE(review): presumably blocks until the queued audio has finished playing - confirm.
    speaker.wait_for_drain()
    speaker.stop()

def save_epub_tts(epub_file, chapter_range):
    """Convert selected chapters of an .epub into one audio file per chapter.

    Files are written to a directory named after the .epub (base name
    without extension), created relative to the current working directory.
    Each chapter is saved as ``NNN`` + ``OUTPUT_FORMAT``; when
    ``SAVE_CHAPTER`` is true the chapter text is also saved as ``NNN.txt``.

    Args:
        epub_file: path to the .epub book.
        chapter_range: 1-based chapter selection such as ``'1,4-6'``, parsed
            by ``stringrange_to_list``.

    Chapter numbers outside ``1..len(chapters)`` are reported and skipped,
    so that 0 or a negative number cannot wrap around to a chapter at the
    end of the book through negative indexing.
    """
    epub = EPUBParser(epub_file)
    chapters = epub.get_chapters()
    chapter_range = stringrange_to_list(chapter_range)
    output_dir = os.path.splitext(os.path.basename(epub_file))[0]
    os.makedirs(output_dir, exist_ok = True)
    model = load_model(CHATTERBOX_MODEL)
    for c in chapter_range:
        if not 1 <= c <= len(chapters):
            print(f'Skip {c}: no such chapter (book has {len(chapters)})')
            continue
        chapter = chapters[c-1]
        print(f'Extract {c}. {chapter.title}: {chapter.char_count:,} characters')
        if SAVE_CHAPTER:
            # Explicit UTF-8 so book text is written the same way regardless of locale.
            with open(os.path.join(output_dir, f'{c:03}.txt'), 'w', encoding = 'utf-8') as file:
                file.write(chapter.text)
        tic = start_timer()
        results = model.generate(text = chapter.text, ref_audio = REFERENCE_AUDIO, temperature=0.5, repetition_penalty=1.5)
        audio = [result.audio for result in results]
        if not audio:
            # Nothing was generated (e.g. a chapter without text); there is
            # nothing to concatenate or write.
            print(f' Skipped {c:03}{OUTPUT_FORMAT}: no audio generated')
            continue
        audio_write(os.path.join(output_dir, f'{c:03}{OUTPUT_FORMAT}'),
            mx.concatenate(audio, axis=0), model.sample_rate)
        tic, minutes, seconds = stop_timer(tic)
        # cpm = characters of chapter text processed per minute of wall time.
        print(f' Completed {c:03}{OUTPUT_FORMAT} in '
            f'{minutes:,} minutes {seconds} seconds ({int(60*chapter.char_count/tic):,} cpm)')

match len(sys.argv):
    case 2:
        epub_file = sys.argv[1]
        if os.path.isfile(epub_file):
            if epub_file.lower().endswith('.epub'):
                list_epub_chapters(epub_file)
            else:
                with open(epub_file, 'r') as file:
                    play_tts(file.read())
        else: 
            play_tts(epub_file)
    case 3:
        epub_file = sys.argv[1]
        chapter_range = sys.argv[2]
        if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):
            tic = start_timer()
            save_epub_tts(epub_file, chapter_range)
            tic, min, sec = stop_timer(tic)
            print(f'Completed in {min:,} minutes {sec} seconds')
    case _:
        cmd = sys.argv[0]
        print(f'''{cmd} text|file - play TTS stream on given text or text in file
{cmd} epub_file - list .epub chapters (file must exist)
{cmd} epub_file chapter_range - perform TTS on given chapters in .epub''')

The code above is a bit cleaner than the previous version:

running python chatter.py "testing 1-2-3" will play the given string (or contents of a text file).
running python chatter.py book.epub will list the chapters of an .epub book.
running python chatter.py book.epub 1,4-6 will save chapters 1, 4, 5 and 6 as .flac files in a new directory following the .epub filename.

Note the code does not assume the presence of a reference voice file.

The easiest way to test with a reference voice is to use one of the sample voice files at Resemble.ai’s GitHub demo page - right-click the audio control under any Prompt (or even an Output), and select “Save Audio as...” or “Download Audio as...” (Firefox and Safari respectively) to save as a .wav file.

Chuck it in a voices directory. Then edit the global variable in the code above, such that REFERENCE_AUDIO points to the saved file, e.g.

REFERENCE_AUDIO = 'voices/c3po.wav'

Update 15 Feb 26: Updated code... to go with part 2, generating a .m4b audiobook with metadata and chapter markers.

❮ Older

Newer ❯