# Local-AI-Stack_RX-6700-XT-ROCm-7.x./Fast-Kokoro-ONNX.py at main · o0LINNY0o/Local-AI-Stack_RX-6700-XT-ROCm-7.x. · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import io
import soundfile as sf
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel
from kokoro_onnx import Kokoro

# Initialize App and Model
# FastAPI application object that the speech route is registered on.
app = FastAPI(title="Kokoro TTS OpenAI API")

# Load the model once at import time so every request reuses the same instance.
# The active line loads the int8 model: 'kokoro-v1.0.int8.onnx' and
# 'voices-v1.0.bin' are passed as relative paths, so they must be reachable
# from the working directory the server is started in.
print("Loading Kokoro model...")
kokoro = Kokoro("kokoro-v1.0.int8.onnx", "voices-v1.0.bin")
# Alternative model files (presumably the fp16 and full-precision variants --
# verify against the files you downloaded); swap one in for the line above:
#kokoro = Kokoro("kokoro-v1.0.fp16.onnx", "voices-v1.0.bin")
#kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
print("Model loaded!")

# Define the request format expected by OpenWebUI (OpenAI compatible).
# This is the JSON body accepted by POST /v1/audio/speech; only `input` is
# required, every other field falls back to the default shown.
class OpenAISpeechRequest(BaseModel):
    model: str = "kokoro"  # Accepted for compatibility; the handler never reads it
    input: str  # Text to synthesize
    voice: str = "af_sarah" # Default voice; OpenAI voice names are remapped by the handler
    speed: float = 1.0  # Forwarded unchanged to kokoro.create(speed=...)
    response_format: str = "mp3" # OpenWebUI sends this, but we will return WAV

# OpenAI voice names that OpenWebUI may send, mapped to Kokoro voices.
# Built once at import time instead of on every request.
_OPENAI_VOICE_ALIASES = {
    "alloy": "am_adam",
    "echo": "af_bella",
    "fable": "af_sarah",
    "onyx": "bm_lewis",
    "nova": "bf_emma",
    "shimmer": "af_nicole",
}


@app.post("/v1/audio/speech")
async def generate_speech(request: OpenAISpeechRequest):
    """Synthesize ``request.input`` with Kokoro and return it as WAV audio.

    Args:
        request: OpenAI-style speech request. ``voice`` is translated through
            the OpenAI alias table and otherwise passed to Kokoro as given;
            ``speed`` is forwarded unchanged. ``model`` and
            ``response_format`` are accepted but not used.

    Returns:
        A ``Response`` whose body is the WAV-encoded audio, with media type
        ``audio/wav`` regardless of the requested ``response_format``.

    Raises:
        HTTPException: 400 when ``input`` is empty or only whitespace;
            500 (with the underlying error text as detail) when synthesis or
            WAV encoding fails.
    """
    # Reject blank text up front. This check lives outside the try block so
    # the broad handler below cannot turn the 400 into a 500.
    if not request.input.strip():
        raise HTTPException(status_code=400, detail="'input' must not be empty")

    # 1. Handle voice mapping: OpenAI names are translated, anything else is
    #    assumed to already be a Kokoro voice name.
    selected_voice = _OPENAI_VOICE_ALIASES.get(request.voice, request.voice)

    try:
        # 2. Generate audio; the language is pinned to en-us explicitly.
        # NOTE(review): this is a synchronous call inside an async handler,
        # so it presumably occupies the event loop for the whole synthesis.
        # Confirm the model is safe to call from a worker thread before
        # offloading it.
        samples, sample_rate = kokoro.create(
            request.input,
            voice=selected_voice,
            speed=request.speed,
            lang="en-us",
        )

        # 3. Convert to WAV in memory. WAV is returned instead of the
        #    requested format to avoid an MP3 encoding step.
        buffer = io.BytesIO()
        sf.write(buffer, samples, sample_rate, format="WAV")
    except Exception as e:
        # Top-level boundary: report the failure and surface it to the client,
        # keeping the original exception as the cause.
        print(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    # 4. Return audio; getvalue() yields the whole buffer without a seek.
    return Response(content=buffer.getvalue(), media_type="audio/wav")

if __name__ == "__main__":
    import uvicorn

    # Serve on every interface, port 8880 (edit the mapping to change either).
    server_options = {"host": "0.0.0.0", "port": 8880}
    uvicorn.run(app, **server_options)