I've tried to enable word or sentence boundary data to get the "time-stamped script" when generating an avatar text-to-speech video.
Everything is running fine, but the only outputs I'm getting are the video and the standard summary.json file.
# Function to submit a batch synthesis job
def submit_synthesis(text, voice, character, style, output_container_url, video_crop):
    """Submit a batch avatar synthesis job to the Azure Speech service.

    Args:
        text: Plain text to synthesize.
        voice: Azure neural voice name chosen by the user.
        character: Talking-avatar character name chosen by the user.
        style: Avatar style chosen by the user (must be valid for the character).
        output_container_url: Blob container URL including a SAS token; the
            service writes all result files there.
        video_crop: Crop rectangle dict with "topLeft"/"bottomRight" points.

    Returns:
        The synthesis job ID string on success, or None on failure.
    """
    synthesis_id = str(uuid.uuid4())  # Generate a unique synthesis ID
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{synthesis_id}?api-version=2024-08-01'
    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
        'Content-Type': 'application/json'
    }
    # Payload with avatarConfig for video generation
    payload = {
        "inputKind": "PlainText",
        "synthesisConfig": {
            "voice": voice  # Voice chosen by the user
        },
        "inputs": [
            {
                "content": text
            }
        ],
        "avatarConfig": {
            "talkingAvatarCharacter": character,  # Character chosen by user
            "talkingAvatarStyle": style,  # Style chosen by user
            "videoFormat": "webm",  # Specify video format
            "videoCodec": "vp9",  # Codec for video
            "subtitleType": "soft_embedded",  # Subtitles option
            "backgroundColor": "transparent",  # Background for avatar
            "videoCrop": video_crop  # Crop based on the user's view selection
        },
        "properties": {
            "destinationContainerUrl": output_container_url,  # Blob storage URL with SAS token
            "timeToLiveInHours": 744,  # How long the service keeps the job
            # NOTE(review): [nnnn].word.json / [nnnn].sentence.json result
            # files are documented for the text-to-speech batch API
            # (/texttospeech/batchsyntheses). The avatar batch API
            # (/avatar/batchsyntheses) may silently ignore these flags, which
            # would explain getting only the video and summary.json — confirm
            # support against the avatar batch synthesis docs. Request both
            # boundary kinds here so either file appears if supported.
            "wordBoundaryEnabled": True,
            "sentenceBoundaryEnabled": True
        }
    }
    # timeout prevents the request from hanging the app indefinitely.
    response = requests.put(url, headers=headers, json=payload, timeout=30)
    if response.status_code in (200, 201):  # service replies 201 Created on submit
        logger.info('Batch avatar synthesis job submitted successfully')
        job_id = response.json()["id"]
        return job_id
    else:
        logger.error(f'Failed to submit batch avatar synthesis job: {response.text}')
        return None
# Function to check the synthesis job status
def get_synthesis_status(job_id):
    """Fetch the current status record for a batch avatar synthesis job.

    Args:
        job_id: The synthesis job ID returned by submit_synthesis().

    Returns:
        The parsed JSON status dict on HTTP 200, otherwise None.
    """
    endpoint = (
        f'https://{SERVICE_REGION}.api.cognitive.microsoft.com'
        f'/avatar/batchsyntheses/{job_id}?api-version=2024-08-01'
    )
    auth_headers = {'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY}

    resp = requests.get(endpoint, headers=auth_headers)
    if resp.status_code != 200:
        logger.error(f'Failed to get batch synthesis job status: {resp.text}')
        return None

    # Log the whole payload so job progress/failures can be read from the logs.
    status_record = resp.json()
    logger.info(f'Job status response: {status_record}')
    return status_record
# Streamlit interface
def main():
    """Render the Streamlit UI: collect inputs, submit the avatar synthesis
    job, poll until it succeeds, fails, or times out, and show the video."""
    st.title("Azure Talking Avatar Video Generator")

    # Language and voice selection
    lang = st.selectbox('Choose the language', list(lang_voices.keys()))
    voice = lang_voices[lang][1]

    # User input for text
    text_input = st.text_area(f'Type the text in {lang}:')

    # Character selection; available styles depend on the chosen character
    selected_character = st.selectbox('Choose the avatar character:', list(character_styles.keys()))
    available_styles = character_styles[selected_character]
    selected_style = st.selectbox('Choose the avatar style:', available_styles)

    output_container_url = st.text_input("Enter Output Container URL (with SAS Token)")

    # Avatar view selects the crop rectangle sent to the service.
    avatar_view = st.selectbox('Choose avatar view', ["Full Body", "Top Half", "Head"], index=0)
    crops = {
        "Full Body": {"topLeft": {"x": 0, "y": 0}, "bottomRight": {"x": 1920, "y": 1080}},
        "Top Half": {"topLeft": {"x": 0, "y": 0}, "bottomRight": {"x": 1920, "y": 540}},
        "Head": {"topLeft": {"x": 700, "y": 0}, "bottomRight": {"x": 1220, "y": 500}},
    }
    video_crop = crops[avatar_view]

    if not st.button("Submit Job"):
        return
    if not (text_input.strip() and output_container_url.strip()):
        st.error("Please provide the text and output container URL.")
        return
    # Guard against a style that is not valid for the selected character.
    if selected_style not in available_styles:
        st.warning(f'The character "{selected_character}" cannot use the chosen style. '
                   f'Please choose from the available styles: {", ".join(available_styles)}')
        return

    with st.spinner("Processing..."):
        job_id = submit_synthesis(text_input, voice, selected_character, selected_style,
                                  output_container_url, video_crop)
        if not job_id:
            return
        st.success(f'Job submitted! Job ID: {job_id}')
        st.info("Checking job status. Please wait...")
        # Bounded polling: the previous `while True` loop never terminated if
        # the status endpoint kept failing or the job never finished.
        max_polls = 240  # 240 polls * 5 s = 20-minute ceiling
        for _ in range(max_polls):
            job_status = get_synthesis_status(job_id)
            status = job_status.get("status") if job_status else None
            if status == "Succeeded":
                st.success("Avatar video generated successfully!")
                # Log the full response to ensure correct output
                logger.info(f'Full job status response: {job_status}')
                _show_result_video(job_status)
                return
            if status == "Failed":
                st.error("Failed to generate the avatar video.")
                return
            time.sleep(5)  # still running (or transient error): retry shortly
        st.error("Timed out waiting for the synthesis job to finish.")


def _show_result_video(job_status):
    """Build the result video URL from a Succeeded job status and render it."""
    destination_container_url = job_status['properties'].get('destinationContainerUrl', '')
    relative_video_path = job_status['outputs'].get('result', '')
    if not (destination_container_url and relative_video_path):
        st.error("Video URL seems invalid or incomplete.")
        return
    # Avoid double slashes when joining container URL and relative path.
    video_url = f"{destination_container_url.rstrip('/')}/{relative_video_path}"
    if not video_url.startswith("http"):
        st.error("Video URL seems invalid or incomplete.")
        return
    st.markdown(f"Download the avatar video [here]({video_url})")
    st.video(video_url)
When I say I'm not getting the timed script file, I'm referring to the [nnnn].word.json file, which I saw in the batch synthesis documentation:
""" If sentence boundary data was requested ("sentenceBoundaryEnabled": true
), then a corresponding [nnnn].sentence.json
file is included in the results. Likewise, if word boundary data was requested ("wordBoundaryEnabled": true
), then a corresponding [nnnn].word.json
file is included in the results.
Here's an example word data file with both audio offset and duration in milliseconds:
[ { "Text": "The", "AudioOffset": 50, "Duration": 137 }, { "Text": "rainbow", "AudioOffset": 200, "Duration": 350 }, { "Text": "has", "AudioOffset": 562, "Duration": 175 }, { "Text": "seven", "AudioOffset": 750, "Duration": 300 }, { "Text": "colors", "AudioOffset": 1062, "Duration": 625 }, { "Text": ".", "AudioOffset": 1700, "Duration": 100 } ]
""" which I found here: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-synthesis#batch-synthesis-results