Python: Text to audio #9625

Open · wants to merge 6 commits into base: main
4 changes: 3 additions & 1 deletion .github/workflows/python-integration-tests.yml
@@ -65,6 +65,7 @@ jobs:
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
BING_API_KEY: ${{ secrets.BING_API_KEY }}
@@ -220,6 +221,7 @@ jobs:
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
BING_API_KEY: ${{ secrets.BING_API_KEY }}
@@ -418,4 +420,4 @@ jobs:
dry_run: ${{ env.run_type != 'Daily' && env.run_type != 'Manual'}}
job: ${{ toJson(job) }}
steps: ${{ toJson(steps) }}
overwrite: "{title: ` ${{ env.run_type }}: ${{ env.date }} `, text: ` ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`}"
6 changes: 5 additions & 1 deletion python/samples/concepts/audio/01-chat_with_audio_input.py
@@ -4,7 +4,7 @@
import logging
import os

from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
@@ -17,6 +17,10 @@
# to create a chat bot that can communicate with the user using audio input.
# The user can engage in a long conversation with the chat bot by speaking to it.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Speech to Text deployment (e.g. whisper).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
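Note: the body of 01-chat_with_audio_input.py is collapsed in this diff. As a hedged sketch only, the imports above and the calls used in 03-chat_with_audio_input_output.py further down suggest the record-and-transcribe step looks roughly like this (the function name and structure are assumptions):

# Hypothetical sketch of the collapsed sample body; AudioRecorder, AzureAudioToText,
# AudioContent.from_audio_file, and get_text_content all appear verbatim in the 03 sample below.
import asyncio
import os

from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.contents.audio_content import AudioContent

AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")


async def transcribe_once() -> str:
    # Record from the microphone (spacebar press/release), then transcribe the wav file.
    with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
        recorder.start_recording()
    result = await AzureAudioToText().get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
    return result.text


if __name__ == "__main__":
    print(asyncio.run(transcribe_once()))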
97 changes: 97 additions & 0 deletions python/samples/concepts/audio/02-chat_with_audio_output.py
@@ -0,0 +1,97 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.contents import ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
# to create a chat bot that can communicate with the user using audio output.
# The chatbot will engage in a conversation with the user and respond using audio output.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)

system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
try:
user_input = input("User:> ")
except KeyboardInterrupt:
print("\n\nExiting chat...")
return False
except EOFError:
print("\n\nExiting chat...")
return False

if user_input == "exit":
print("\n\nExiting chat...")
return False

history.add_user_message(user_input)

# No need to stream the response since we can only pass the
# response to the text to audio service as a whole
response = await chat_service.get_chat_message_content(
chat_history=history,
settings=OpenAIChatPromptExecutionSettings(
max_tokens=2000,
temperature=0.7,
top_p=0.8,
),
)

# Need to set the response format to wav since the audio player only supports wav files
audio_content = await text_to_audio_service.get_audio_content(
response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
)
AudioPlayer(audio_content=audio_content).play()

print(f"Mosscap:> {response.content}")

history.add_assistant_message(response.content)

return True


async def main() -> None:
chatting = True
while chatting:
chatting = await chat()


if __name__ == "__main__":
asyncio.run(main())
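The sample plays the synthesized reply through AudioPlayer; if you would rather persist it, the same bytes can be written to disk. A minimal hedged variation (assumes audio_content.data holds the raw wav bytes, which is also what AudioPlayer below reads):

# Hypothetical helper: save the synthesized reply instead of playing it.
from pathlib import Path

from semantic_kernel.contents.audio_content import AudioContent


def save_reply(audio_content: AudioContent, path: str = "reply.wav") -> None:
    # AudioPlayer wraps audio_content.data in a BytesIO the same way; here we just persist it.
    Path(path).write_bytes(audio_content.data)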
115 changes: 115 additions & 0 deletions python/samples/concepts/audio/03-chat_with_audio_input_output.py
@@ -0,0 +1,115 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
import os

from samples.concepts.audio.audio_player import AudioPlayer
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.audio_content import AudioContent

# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
# services to create a chat bot that can communicate with the user using both audio input and output.
# The chatbot will engage in a conversation with the user by audio only.
# This sample combines the functionality of the samples/concepts/audio/01-chat_with_audio_input.py and
# samples/concepts/audio/02-chat_with_audio_output.py samples.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).
# 3. An Azure Speech to Text deployment (e.g. whisper).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")


system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()
audio_to_text_service = AzureAudioToText()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
try:
print("User:> ", end="", flush=True)
with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
recorder.start_recording()
user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
print(user_input.text)
except KeyboardInterrupt:
print("\n\nExiting chat...")
return False
except EOFError:
print("\n\nExiting chat...")
return False

if "exit" in user_input.text.lower():
print("\n\nExiting chat...")
return False

history.add_user_message(user_input.text)

# No need to stream the response since we can only pass the
# response to the text to audio service as a whole
response = await chat_service.get_chat_message_content(
chat_history=history,
settings=OpenAIChatPromptExecutionSettings(
max_tokens=2000,
temperature=0.7,
top_p=0.8,
),
)

# Need to set the response format to wav since the audio player only supports wav files
audio_content = await text_to_audio_service.get_audio_content(
response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
)
print("Mosscap:> ", end="", flush=True)
AudioPlayer(audio_content=audio_content).play(text=response.content)

history.add_assistant_message(response.content)

return True


async def main() -> None:
print(
"Instruction: when it's your turn to speak, press the spacebar to start recording."
" Release the spacebar to stop recording."
)

chatting = True
while chatting:
chatting = await chat()


if __name__ == "__main__":
asyncio.run(main())
99 changes: 99 additions & 0 deletions python/samples/concepts/audio/audio_player.py
@@ -0,0 +1,99 @@
# Copyright (c) Microsoft. All rights reserved.

import io
import logging
import wave
from typing import ClassVar

import pyaudio

from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.kernel_pydantic import KernelBaseModel

logging.basicConfig(level=logging.WARNING)
logger: logging.Logger = logging.getLogger(__name__)


class AudioPlayer(KernelBaseModel):
"""A class to play an audio file to the default audio output device."""

# Audio replay parameters
CHUNK: ClassVar[int] = 1024

audio_content: AudioContent

def play(self, text: str | None = None) -> None:
"""Play the audio content to the default audio output device.

Args:
text (str, optional): The text to display while playing the audio. Defaults to None.
"""
audio_stream = io.BytesIO(self.audio_content.data)
with wave.open(audio_stream, "rb") as wf:
audio = pyaudio.PyAudio()
stream = audio.open(
format=audio.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True,
)

if text:
# Simulate the output of text while playing the audio
data_frames = []

data = wf.readframes(self.CHUNK)
while data:
data_frames.append(data)
data = wf.readframes(self.CHUNK)

if len(data_frames) < len(text):
                    logger.warning(
                        "The audio is too short to play the entire text. "
                        "The text will be displayed without synchronization."
                    )
print(text)
else:
for data_frame, text_frame in self._zip_text_and_audio(text, data_frames):
stream.write(data_frame)
print(text_frame, end="", flush=True)
print()
else:
data = wf.readframes(self.CHUNK)
while data:
stream.write(data)
data = wf.readframes(self.CHUNK)

stream.stop_stream()
stream.close()
audio.terminate()

def _zip_text_and_audio(self, text: str, audio_frames: list) -> zip:
"""Zip the text and audio frames together so that they can be displayed in sync.

        This is done by evenly distributing empty strings between the characters and
        appending any remaining empty strings at the end.

Args:
text (str): The text to display while playing the audio.
audio_frames (list): The audio frames to play.

Returns:
zip: The zipped text and audio frames.
"""
text_frames = list(text)
empty_string_count = len(audio_frames) - len(text_frames)
        if empty_string_count <= 0:
            # The audio and text are already the same length; no padding is needed.
            return zip(audio_frames, text_frames)
        # Keep the spacing at least 1 so the modulo below never divides by zero
        # when there are more paddings than characters.
        empty_string_spacing = max(1, len(text_frames) // empty_string_count)

modified_text_frames = []
current_empty_string_count = 0
for i, text_frame in enumerate(text_frames):
modified_text_frames.append(text_frame)
if current_empty_string_count < empty_string_count and i % empty_string_spacing == 0:
modified_text_frames.append("")
current_empty_string_count += 1

if current_empty_string_count < empty_string_count:
modified_text_frames.extend([""] * (empty_string_count - current_empty_string_count))

return zip(audio_frames, modified_text_frames)
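To see what the padding scheme produces end to end, a quick standalone check (dummy byte frames; no pyaudio required):

# Standalone illustration of the padding used by _zip_text_and_audio:
# 6 audio frames vs. 4 characters leaves 2 empty paddings, spaced every 4 // 2 = 2 characters.
audio_frames = [b"f0", b"f1", b"f2", b"f3", b"f4", b"f5"]
text_frames = list("chat")

empty_string_count = len(audio_frames) - len(text_frames)  # 2
empty_string_spacing = max(1, len(text_frames) // empty_string_count)  # 2

padded: list[str] = []
added = 0
for i, ch in enumerate(text_frames):
    padded.append(ch)
    if added < empty_string_count and i % empty_string_spacing == 0:
        padded.append("")
        added += 1

print(list(zip(audio_frames, padded)))
# [(b'f0', 'c'), (b'f1', ''), (b'f2', 'h'), (b'f3', 'a'), (b'f4', ''), (b'f5', 't')]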
@@ -14,7 +14,9 @@ class OpenAIAudioToTextExecutionSettings(PromptExecutionSettings):
"""Request settings for OpenAI audio to text services."""

ai_model_id: str | None = Field(None, serialization_alias="model")
filename: str | None = None
filename: str | None = Field(
None, description="Do not set this manually. It is set by the service based on the audio content."
)
language: str | None = None
prompt: str | None = None
response_format: str | None = None
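A minimal usage sketch for these settings (the import path is an assumption by analogy with the open_ai_text_to_audio_execution_settings module imported in the samples above; filename is deliberately left unset, as its new description instructs):

# Hypothetical usage; the module path mirrors the text-to-audio settings module above.
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
    OpenAIAudioToTextExecutionSettings,
)

# filename is not set here: the service derives it from the AudioContent it receives.
settings = OpenAIAudioToTextExecutionSettings(language="en")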
8 changes: 6 additions & 2 deletions python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_prompt_execution_settings.py
@@ -38,7 +38,9 @@ class OpenAIPromptExecutionSettings(PromptExecutionSettings):
class OpenAITextPromptExecutionSettings(OpenAIPromptExecutionSettings):
"""Specific settings for the completions endpoint."""

prompt: str | None = None
prompt: str | None = Field(
None, description="Do not set this manually. It is set by the service based on the text content."
)
best_of: int | None = Field(None, ge=1)
echo: bool = False
logprobs: int | None = Field(None, ge=0, le=5)
@@ -66,7 +68,9 @@ class OpenAIChatPromptExecutionSettings(OpenAIPromptExecutionSettings):
) = None
function_call: str | None = None
functions: list[dict[str, Any]] | None = None
messages: list[dict[str, Any]] | None = None
messages: list[dict[str, Any]] | None = Field(
None, description="Do not set this manually. It is set by the service based on the chat history."
)
function_call_behavior: FunctionCallBehavior | None = Field(None, exclude=True)
parallel_tool_calls: bool = True
tools: list[dict[str, Any]] | None = Field(
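The samples in this PR already follow the pattern these new Field descriptions document: chat history is passed separately and the service fills in messages itself. A condensed, hedged restatement of that call shape, reusing only imports and calls that appear in the samples above:

# Sketch of the calling pattern the new descriptions encode: messages is never set
# by hand; get_chat_message_content builds it from chat_history.
import asyncio

from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
    OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.contents import ChatHistory


async def main() -> None:
    history = ChatHistory()
    history.add_user_message("Hi there, who are you?")
    settings = OpenAIChatPromptExecutionSettings(max_tokens=2000, temperature=0.7, top_p=0.8)
    response = await AzureChatCompletion().get_chat_message_content(chat_history=history, settings=settings)
    print(response.content)


asyncio.run(main())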