I've tried to enable word or sentence boundary data to get the "time-stamped script" when generating an avatar text-to-speech video.
Everything is running fine, but the only outputs I'm getting are the video and the standard summary.json file.
# Function to submit a batch synthesis job
def submit_synthesis(text, voice, character, style, output_container_url, video_crop):
    """Submit a batch avatar synthesis job to the Azure Speech service.

    Args:
        text: Plain text to synthesize.
        voice: Azure neural voice name chosen by the user.
        character: Talking-avatar character name chosen by the user.
        style: Avatar style chosen by the user (must be valid for the character).
        output_container_url: Blob container URL including a SAS token; the
            service writes all result files there.
        video_crop: Crop rectangle dict with "topLeft"/"bottomRight" points.

    Returns:
        The synthesis job ID string on success, or None on failure.
    """
    synthesis_id = str(uuid.uuid4())  # Generate a unique synthesis ID
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{synthesis_id}?api-version=2024-08-01'
    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
        'Content-Type': 'application/json'
    }
    # Payload with avatarConfig for video generation
    payload = {
        "inputKind": "PlainText",
        "synthesisConfig": {
            "voice": voice  # Voice chosen by the user
        },
        "inputs": [
            {
                "content": text
            }
        ],
        "avatarConfig": {
            "talkingAvatarCharacter": character,  # Character chosen by user
            "talkingAvatarStyle": style,  # Style chosen by user
            "videoFormat": "webm",  # Specify video format
            "videoCodec": "vp9",  # Codec for video
            "subtitleType": "soft_embedded",  # Subtitles option
            "backgroundColor": "transparent",  # Background for avatar
            "videoCrop": video_crop  # Crop based on the user's view selection
        },
        "properties": {
            "destinationContainerUrl": output_container_url,  # Blob storage URL with SAS token
            "timeToLiveInHours": 744,  # How long the service keeps the job
            # NOTE(review): [nnnn].word.json / [nnnn].sentence.json result
            # files are documented for the text-to-speech batch API
            # (/texttospeech/batchsyntheses). The avatar batch API
            # (/avatar/batchsyntheses) may silently ignore these flags, which
            # would explain getting only the video and summary.json — confirm
            # support against the avatar batch synthesis docs. Request both
            # boundary kinds here so either file appears if supported.
            "wordBoundaryEnabled": True,
            "sentenceBoundaryEnabled": True
        }
    }
    # timeout prevents the request from hanging the app indefinitely.
    response = requests.put(url, headers=headers, json=payload, timeout=30)
    if response.status_code in (200, 201):  # service replies 201 Created on submit
        logger.info('Batch avatar synthesis job submitted successfully')
        job_id = response.json()["id"]
        return job_id
    else:
        logger.error(f'Failed to submit batch avatar synthesis job: {response.text}')
        return None
# Function to check the synthesis job status
def get_synthesis_status(job_id):
    """Fetch the current status record for a batch avatar synthesis job.

    Args:
        job_id: The synthesis job ID returned by submit_synthesis().

    Returns:
        The parsed JSON status dict on HTTP 200, otherwise None.
    """
    endpoint = (
        f'https://{SERVICE_REGION}.api.cognitive.microsoft.com'
        f'/avatar/batchsyntheses/{job_id}?api-version=2024-08-01'
    )
    auth_headers = {'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY}

    resp = requests.get(endpoint, headers=auth_headers)
    if resp.status_code != 200:
        logger.error(f'Failed to get batch synthesis job status: {resp.text}')
        return None

    # Log the whole payload so job progress/failures can be read from the logs.
    status_record = resp.json()
    logger.info(f'Job status response: {status_record}')
    return status_record
# Streamlit interface
def main():
    """Render the Streamlit UI: collect inputs, submit the avatar synthesis
    job, poll until it succeeds, fails, or times out, and show the video."""
    st.title("Azure Talking Avatar Video Generator")

    # Language and voice selection
    lang = st.selectbox('Choose the language', list(lang_voices.keys()))
    voice = lang_voices[lang][1]

    # User input for text
    text_input = st.text_area(f'Type the text in {lang}:')

    # Character selection; available styles depend on the chosen character
    selected_character = st.selectbox('Choose the avatar character:', list(character_styles.keys()))
    available_styles = character_styles[selected_character]
    selected_style = st.selectbox('Choose the avatar style:', available_styles)

    output_container_url = st.text_input("Enter Output Container URL (with SAS Token)")

    # Avatar view selects the crop rectangle sent to the service.
    avatar_view = st.selectbox('Choose avatar view', ["Full Body", "Top Half", "Head"], index=0)
    crops = {
        "Full Body": {"topLeft": {"x": 0, "y": 0}, "bottomRight": {"x": 1920, "y": 1080}},
        "Top Half": {"topLeft": {"x": 0, "y": 0}, "bottomRight": {"x": 1920, "y": 540}},
        "Head": {"topLeft": {"x": 700, "y": 0}, "bottomRight": {"x": 1220, "y": 500}},
    }
    video_crop = crops[avatar_view]

    if not st.button("Submit Job"):
        return
    if not (text_input.strip() and output_container_url.strip()):
        st.error("Please provide the text and output container URL.")
        return
    # Guard against a style that is not valid for the selected character.
    if selected_style not in available_styles:
        st.warning(f'The character "{selected_character}" cannot use the chosen style. '
                   f'Please choose from the available styles: {", ".join(available_styles)}')
        return

    with st.spinner("Processing..."):
        job_id = submit_synthesis(text_input, voice, selected_character, selected_style,
                                  output_container_url, video_crop)
        if not job_id:
            return
        st.success(f'Job submitted! Job ID: {job_id}')
        st.info("Checking job status. Please wait...")
        # Bounded polling: the previous `while True` loop never terminated if
        # the status endpoint kept failing or the job never finished.
        max_polls = 240  # 240 polls * 5 s = 20-minute ceiling
        for _ in range(max_polls):
            job_status = get_synthesis_status(job_id)
            status = job_status.get("status") if job_status else None
            if status == "Succeeded":
                st.success("Avatar video generated successfully!")
                # Log the full response to ensure correct output
                logger.info(f'Full job status response: {job_status}')
                _show_result_video(job_status)
                return
            if status == "Failed":
                st.error("Failed to generate the avatar video.")
                return
            time.sleep(5)  # still running (or transient error): retry shortly
        st.error("Timed out waiting for the synthesis job to finish.")


def _show_result_video(job_status):
    """Build the result video URL from a Succeeded job status and render it."""
    destination_container_url = job_status['properties'].get('destinationContainerUrl', '')
    relative_video_path = job_status['outputs'].get('result', '')
    if not (destination_container_url and relative_video_path):
        st.error("Video URL seems invalid or incomplete.")
        return
    # Avoid double slashes when joining container URL and relative path.
    video_url = f"{destination_container_url.rstrip('/')}/{relative_video_path}"
    if not video_url.startswith("http"):
        st.error("Video URL seems invalid or incomplete.")
        return
    st.markdown(f"Download the avatar video [here]({video_url})")
    st.video(video_url)
When I say I'm not getting the timed script file, I'm referring to the [nnnn].word.json file, which I saw in the batch synthesis documentation:
""" If sentence boundary data was requested ("sentenceBoundaryEnabled": true
), then a corresponding [nnnn].sentence.json
file is included in the results. Likewise, if word boundary data was requested ("wordBoundaryEnabled": true
), then a corresponding [nnnn].word.json
file is included in the results.
Here's an example word data file with both audio offset and duration in milliseconds:
[ { "Text": "The", "AudioOffset": 50, "Duration": 137 }, { "Text": "rainbow", "AudioOffset": 200, "Duration": 350 }, { "Text": "has", "AudioOffset": 562, "Duration": 175 }, { "Text": "seven", "AudioOffset": 750, "Duration": 300 }, { "Text": "colors", "AudioOffset": 1062, "Duration": 625 }, { "Text": ".", "AudioOffset": 1700, "Duration": 100 } ]
""" which I found here: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-synthesis#batch-synthesis-results