ElevenLabs API

Synchronized Text Highlighting with ElevenLabs Speech in Laravel/PHP — Brandon Demeria

The parser

/**
 * Convert text to speech via the ElevenLabs websocket API.
 *
 * Streams the text to the API, then reads audio chunks plus alignment data
 * from the socket. The alignment data (per-character start times and
 * durations in ms) is accumulated so word-level timestamps can be derived.
 *
 * @param string $text The text to synthesize.
 * @return AudioDataDto The raw audio bytes, original text, and word timestamps.
 */
function textToSpeech(string $text): AudioDataDto
{
    $client = $this->setupClient();
    $client->text($this->streamMessage($text));
    $client->text($this->streamEndOfStreamMessage());

    // Initialise as an empty string (not null) so the .= concatenation
    // below is well-defined from the first chunk.
    $audioData = '';
    $startTimes = [];
    $characters = [];
    $durations = [];
    $startTimeOffsets = [];
    $offset = 0;

    // From the stream of data, we will receive audio chunks and alignment data.
    // We will use the alignment data to calculate the start times for each word.
    while ($client->isConnected()) {
        $response = $client->receive();
        $data = json_decode($response, true);

        // Also guard against json_decode() failing (returns null) before
        // indexing into the payload.
        if (is_array($data) && !empty($data["audio"])) {
            // Capture the audio data.
            $chunk = base64_decode($data["audio"]);
            $audioData .= $chunk;

            // Append "start times" for each character to an array.
            if (isset($data['normalizedAlignment']['charStartTimesMs'])) {
                // Merge existing start times with new start times from the current response.
                $startTimes = array_merge($startTimes, $data['normalizedAlignment']['charStartTimesMs']);
                // Every character in this chunk shares the same cumulative
                // offset, so record the offset once per start time.
                $amount = count($data['normalizedAlignment']['charStartTimesMs']);
                $offsetArray = array_fill(0, $amount, $offset);
                $startTimeOffsets = array_merge($startTimeOffsets, $offsetArray);
            }
            // Merge all characters to an array.
            if (isset($data['normalizedAlignment']['chars'])) {
                $characters = array_merge($characters, $data['normalizedAlignment']['chars']);
            }
            // Merge all character durations to an array.
            if (isset($data['normalizedAlignment']['charDurationsMs'])) {
                $durations = array_merge($durations, $data['normalizedAlignment']['charDurationsMs']);
            }
            // Update the offset by adding the last start time and the last duration.
            // This ensures that for each new audio chunk we correctly calculate the
            // start times by accounting for the total elapsed time (offset) from
            // previous chunks. Guard against empty arrays: end() returns false
            // on an empty array, which would poison the arithmetic.
            if (isset($data['normalizedAlignment']) && $startTimes !== [] && $durations !== []) {
                $offset += end($startTimes) + end($durations);
            }
        } else {
            break;
        }
    }

    // The client connection may close automatically or fail.
    try {
        $client->close();
    } catch (\Exception $e) {
        Log::error('Error closing websocket client: ' . $e->getMessage());
    }

    // Collapse per-character alignment into per-word timestamps.
    $timestamps = $this->calculateWordTimestamps($startTimes, $startTimeOffsets, $characters);
    return new AudioDataDto($text, $audioData, $timestamps);
}

Calculating word timestamps

/**
 * Group per-character alignment data into word-level start times.
 *
 * Words are delimited by single space characters (the delimiter used
 * between words in the alignment stream).
 *
 * @param array $startTimes       Per-character start times in ms, relative to their chunk.
 * @param array $startTimeOffsets Per-character cumulative chunk offsets in ms.
 * @param array $characters       The characters of the spoken text, in order.
 * @return array{words: array, start_times: array}
 */
private function calculateWordTimestamps(array $startTimes, array $startTimeOffsets, array $characters): array {
    $words = [];
    $wordStartTimes = [];
    $currentWord = '';
    $currentWordStartTime = null;

    foreach ($characters as $index => $char) {
        if ($currentWordStartTime === null) {
            // Guard with ?? 0: the alignment arrays can be shorter than the
            // character list if a chunk arrived without start times/offsets.
            $currentWordStartTime = ($startTimes[$index] ?? 0) + ($startTimeOffsets[$index] ?? 0);
        }

        // Append character to the current word.
        $currentWord .= $char;

        // A space ends the current word; the final character ends the last word.
        if ($char === ' ' || $index === count($characters) - 1) {
            if (trim($currentWord) !== '') { // Do not record bare spaces as words.
                $words[] = trim($currentWord);
                $wordStartTimes[] = $currentWordStartTime;
            }

            // Reset for the next word.
            $currentWord = '';
            $currentWordStartTime = null;
        }
    }

    return [
        'words' => $words,
        'start_times' => $wordStartTimes,
    ];
}

Data Transfer Object

/**
 * Data transfer object carrying the synthesized speech: the source text,
 * the raw audio bytes, and the word-level timestamps.
 */
class AudioDataDto
{
    /**
     * @param string $text       The text that was synthesized.
     * @param string $audioData  Raw (binary) audio bytes. Typed explicitly now:
     *                           the property was string but the parameter was not.
     * @param array  $timestamps Word timestamps, shape: {words: array, start_times: array}.
     */
    public function __construct(
        public string $text,
        public string $audioData,
        public array $timestamps,
    ) {
    }
}

Saving the audio to S3 as an .mp3 file

/**
 * Persist the generated audio to S3 and create the matching Audio record.
 *
 * @param AudioDataDto $audioDataDto The generated audio payload.
 * @return Audio The newly created Audio model.
 */
public function uploadAudioData(AudioDataDto $audioDataDto): Audio
{
    // Str::uuid() (lowercase, the documented name) returns a Uuid instance;
    // cast it so the record id and file path are plain strings.
    $randomUuid = (string) Str::uuid();
    $audioPath = 'audio/' . $randomUuid . ".mp3";
    $uploaded = Storage::disk('s3')->put($audioPath, $audioDataDto->audioData);

    if ($uploaded) {
        return Audio::create([
            Audio::KEY_ID => $randomUuid,
            Audio::KEY_TEXT => $audioDataDto->text,
            Audio::KEY_FILE_PATH => $audioPath,
            Audio::KEY_WORD_TIMESTAMPS => $audioDataDto->timestamps,
        ]);
    }

    // Storage::put() returned false — surface as a server error.
    abort(500, "Failed to upload audio data.");
}

Retrieving the audio

/**
 * Look up an Audio record by its UUID.
 *
 * Uses firstOrFail() so an unknown id raises ModelNotFoundException
 * (rendered as a 404 by Laravel) instead of returning null, which would
 * violate the declared Audio return type with a TypeError.
 *
 * @param string $audioId The audio record's UUID.
 * @return Audio
 */
public function getAudioData(string $audioId): Audio
{
    return Audio::where(Audio::KEY_ID, $audioId)->firstOrFail();
}

/**
 * Fetch the raw audio file bytes and mime type for streaming to the client.
 *
 * @param string $audioId The audio record's UUID.
 * @return array{file: string, type: string}
 */
public function getAudioFile(string $audioId) {
    $audio = Audio::where(Audio::KEY_ID, $audioId)->first();

    // Guard against an unknown id before dereferencing the model —
    // previously $audio->file_path fataled when no record was found.
    if ($audio === null) {
        abort(404, "Audio not found.");
    }

    $filePath = $audio->file_path;

    if (Storage::disk('s3')->exists($filePath)) {
        $file = Storage::disk('s3')->get($filePath);
        $type = Storage::disk('s3')->mimeType($filePath);
        return [
            'file' => $file,
            'type' => $type,
        ];
    }

    abort(404, "Audio not found.");
}

The Audio Eloquent model (served via the /api/audio endpoint)

<?php

namespace App\Models;

use App\Traits\PrimaryUuidTrait;
use Illuminate\Database\Eloquent\Model;

/**
 * Eloquent model for a generated speech clip: the source text, the S3 file
 * path, and the word-level timestamps used for synchronized highlighting.
 *
 * NOTE(review): App\Traits\PrimaryUuidTrait is imported at the top of the
 * file but never applied here — confirm whether `use PrimaryUuidTrait;` is
 * missing from the class body.
 */
class Audio extends Model
{
    const TABLE_NAME = "audio";

    // Column-name constants keep queries and mass assignment typo-safe.
    const KEY_ID = "id"; // NOTE: UUID for the id
    const KEY_TEXT = "text";
    const KEY_FILE_PATH = "file_path";
    const KEY_WORD_TIMESTAMPS = "word_timestamps";
    const KEY_CREATED_AT = "created_at";
    const KEY_UPDATED_AT = "updated_at";

    const APPENDS_URL = "url";

    // Always serialise the computed "url" attribute with the model.
    public $appends = [
        self::APPENDS_URL,
    ];

    protected $table = self::TABLE_NAME;

    public $preventsLazyLoading = true;

    protected $fillable = [
        self::KEY_ID,
        self::KEY_TEXT,
        self::KEY_FILE_PATH,
        self::KEY_WORD_TIMESTAMPS,
        self::KEY_CREATED_AT,
        self::KEY_UPDATED_AT,
    ];

    // Word timestamps are stored as JSON and hydrated to an array.
    protected $casts = [
        self::KEY_WORD_TIMESTAMPS => 'json',
    ];

    /**
     * The url is for client devices to access the voice audio file.
     * This does not use the file's storage path, as the storage path is not
     * accessible to client devices.
     * @return string
     */
    public function getUrlAttribute(): string
    {
        // Use config('app.url') rather than env('APP_URL'): env() returns
        // null outside config files once config caching is enabled
        // (php artisan config:cache), silently breaking generated URLs.
        return config('app.url') . '/api/audio/' . $this->id . ".mp3";
    }
}

Response object

<?php

namespace App\Responses;

use App\Models\Audio;
use JsonSerializable;

/**
 * JSON-serialisable API response wrapping an Audio model for client devices.
 */
class AudioResponse implements JsonSerializable {

    public string $id;

    public string $url;

    public string $text;

    /** @var array{words?: array, start_times?: array} */
    public array $timestamps;

    public function __construct(Audio $audio)
    {
        $this->id = $audio->id;
        $this->url = $audio->url;
        $this->text = $audio->text;
        // Default to an empty array so a record stored without timestamps
        // does not fatal when assigned to the typed array property.
        $this->timestamps = $audio->word_timestamps ?? [];
    }

    /**
     * Shape consumed by the Vue SynchronizedTextBox component.
     */
    public function jsonSerialize(): array
    {
        return [
            "id" => $this->id,
            "url" => $this->url,
            "text" => $this->text,
            "timestamps" => $this->timestamps,
        ];
    }
}
{
"id": "5b492e71-3c40-4480-a006-bdba0572ba0d",
"url": "http://localhost/api/audio/5b492e71-3c40-4480-a006-bdba0572ba0d.mp3",
"text": "Welcome brave travellers.",
"timestamps": {
  "words": ["Welcome", "brave", "travellers."],
  "start_times": [46, 441, 685]
}
}

Vue SynchronizedTextBox

<script setup>

import {computed, onMounted, reactive, ref, watch, watchEffect} from "vue";

const props = defineProps({
    audioData: {
        type: Object,
        // Vue 3 requires object/array prop defaults to come from a factory
        // function, and an Object prop should default to an object, not [].
        default: () => ({}),
    },
});

// The currently playing HTMLAudioElement, or null when nothing is loaded.
let audio = null;
const data = reactive({
    audioTimestamp: 0, // Current playback position in seconds.
    isPlaying: false,
})

// Stop playback, rewind to the start, and clear the playing flag.
function pauseAudio() {
    if (!audio) {
        return;
    }
    audio.pause();
    audio.currentTime = 0;
    data.isPlaying = false;
}

// Click handler for the play icon in the template; forwards the clicked
// clip's url and audioData object straight to playAudio.
function playAudioClicked(url, message) {
    playAudio(url, message);
}

// Start playback of a clip, replacing any clip that is already playing.
// `audioData` is the API response object; its `text` field is used to
// estimate the clip duration before the browser reports a real one.
function playAudio(url, audioData) {
    // Stop and rewind any previously playing clip before replacing it.
    if (audio) {
        audio.pause();
        audio.currentTime = 0;
    }

    audio = new Audio(url);
    audio.playbackRate = 1.0;

    // Estimate the duration of the audio because the duration is not always available immediately.
    let words = audioData.text.split(' ').length;
    let estimatedDuration = words / 3; // 3 words per second

    // Only flip isPlaying on once play() actually resolves; a rejected
    // play() (e.g. autoplay blocked) leaves it false.
    data.isPlaying = false
    audio.play().then(function() {
        data.isPlaying = true;
    }).catch(function(error) {
        data.isPlaying = false;
    });

    // Sets data.audioTimestamp so we can reference the current time elsewhere.
    audio.addEventListener('timeupdate', function() {
        data.audioTimestamp = audio.currentTime;
        // Fall back to the word-count estimate while audio.duration is NaN/Infinity.
        let actualDuration = isFinite(audio.duration) ? audio.duration : estimatedDuration;
        let percentagePlayed = (audio.currentTime / actualDuration) * 100;
        // Treat reaching the (possibly estimated) end as playback finished.
        if (percentagePlayed >= 100) {
            data.isPlaying = false;
        }
    }, false);
}

// Determines if the word index should be highlighted based on current time from our audio clip.
function shouldHighlight(audioData, index) {
    if (!data.isPlaying) {
        return false;
    }

    const currentWordTime = audioData.timestamps?.start_times[index] ?? null;

    if (currentWordTime !== null) {
        const adjustedCurrentWordTime = Math.max(currentWordTime - 100, 0);
        const audioTimeInMs = data.audioTimestamp * 1000;
        if ((audioTimeInMs) > adjustedCurrentWordTime) {
            return true;
        }
    }

    return false
}

</script>
<template>
    <!-- Renders the clip's text word by word so each word can be highlighted
         individually once the audio playback reaches its start timestamp. -->
    <div class="bg-gray-900 py-0 wrapper text-white overflow-hidden">
        <div class="relative px-8 py-4 min-h-[56px] bg-gray-700">
            <!-- Word index lines up with timestamps.start_times[index] in shouldHighlight. -->
            <span v-for="(word, index) in props.audioData.text.split(' ')" :key="index" :class="{ 'text-gray-500': shouldHighlight(props.audioData, index) && data.isPlaying }">
                {{ word }}
                <!-- Re-insert the space the split() removed, except after the last word. -->
                <span v-if="index < props.audioData.text.split(' ').length - 1"> </span>
            </span>
            <!-- Play icon while stopped; pause icon while playing. -->
            <span v-if="!data.isPlaying" class="absolute top-2 right-3 text-white cursor-pointer text-lg" @click="playAudioClicked(props.audioData.url, props.audioData)">
                
            </span>
            <span v-else class="absolute top-2 right-3 text-white cursor-pointer text-lg" @click="pauseAudio()">
                
            </span>
        </div>
    </div>
</template>
<style scoped>
/* Fades a highlighted word from grey (#6b7280) to the inherited text
   colour. Keyframes listed in conventional 0% -> 100% order. */
@keyframes fade-in-color {
    0% {
        color: #6b7280;
    }
    100% {
        color: inherit;
    }
}

.text-gray-500 {
    animation: fade-in-color 2s ease-in forwards;
}
</style>