* talk-llama: pass file instead of arg
it is too hard to quote text in a portable way
* talk-llama: pass heard_ok as a file
* talk-llama: let eleven-labs.py accept options
Options: -v voice, -s savefile, -p (--play)
* talk-llama: check installed commands in "speak"
Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed
* talk-llama: pass voice_id again
in order to sync talk with talk-llama
* talk: sync with talk-llama
Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375
* talk and talk-llama: get all installed voices in speak.ps1
* talk and talk-llama: get voices from api
* talk and talk-llama: add more options to eleven-labs.py
and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)
```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]
options:
-q, --quick skip checking the required library
action:
TEXTFILE read the text file (default: stdin)
-l, --list show the list of voices and exit
-h, --help show this help and exit
voice selection:
-n NAME, --name NAME get a voice object by name (default: Arnold)
-v NUMBER, --voice NUMBER
get a voice object by number (see --list)
-f KEY=VAL, --filter KEY=VAL
filter voices by labels (default: "use case=narration")
this option can be used multiple times
filtering will be disabled if the first -f has no "=" (e.g. -f "any")
output:
-s FILE, --save FILE save the TTS to a file (default: audio.mp3)
-p, --play play the TTS with ffplay
```
* examples: add speak_with_file()
as suggested in the review
* talk and talk-llama: ignore to_speak.txt
std::ifstream infile(fileName);
return infile.good();
}
+
+// Write `text` to the file at `path`, then run `command voice_id path` via system().
+// Returns true only when the file was written successfully and the command
+// exited with status 0; otherwise prints a message to stderr and returns false.
+bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
+{
+    std::ofstream speak_file(path.c_str());
+    if (speak_file.fail()) {
+        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
+        return false;
+    }
+
+    speak_file.write(text.c_str(), text.size());
+    speak_file.close();
+    // check the stream after close() so that a failed write or flush is not
+    // followed by speaking a truncated or stale file
+    if (speak_file.fail()) {
+        fprintf(stderr, "%s: failed to write speak_file\n", __func__);
+        return false;
+    }
+
+    // NOTE: `path` is appended without quoting, so it must not contain
+    // whitespace or characters that are special to the command interpreter
+    const std::string cmd = command + " " + std::to_string(voice_id) + " " + path;
+    if (system(cmd.c_str()) != 0) {
+        fprintf(stderr, "%s: failed to speak\n", __func__);
+        return false;
+    }
+    return true;
+}
// check if file exists using ifstream
bool is_file_exist(const char *fileName);
+
+// write text to file, and call system("command voice_id file")
+bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
import sys
-import importlib.util
+import argparse
+import textwrap
-if importlib.util.find_spec("elevenlabs") is None:
- print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
- sys.exit()
+parser = argparse.ArgumentParser(add_help=False,
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument("-q", "--quick", action="store_true",
+ help="skip checking the required library")
+
+modes = parser.add_argument_group("action")
+modes.add_argument("inputfile", metavar="TEXTFILE",
+ nargs='?', type=argparse.FileType(), default=sys.stdin,
+ help="read the text file (default: stdin)")
+modes.add_argument("-l", "--list", action="store_true",
+ help="show the list of voices and exit")
+modes.add_argument("-h", "--help", action="help",
+ help="show this help and exit")
+
+selopts = parser.add_argument_group("voice selection")
+selmodes = selopts.add_mutually_exclusive_group()
+selmodes.add_argument("-n", "--name",
+ default="Arnold",
+ help="get a voice object by name (default: Arnold)")
+selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
+ help="get a voice object by number (see --list)")
+selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
+ default=["use case=narration"],
+ help=textwrap.dedent('''\
+ filter voices by labels (default: "use case=narration")
+ this option can be used multiple times
+ filtering will be disabled if the first -f has no "=" (e.g. -f "any")
+ '''))
+
+outmodes = parser.add_argument_group("output")
+outgroup = outmodes.add_mutually_exclusive_group()
+outgroup.add_argument("-s", "--save", metavar="FILE",
+ default="audio.mp3",
+ help="save the TTS to a file (default: audio.mp3)")
+outgroup.add_argument("-p", "--play", action="store_true",
+ help="play the TTS with ffplay")
+
+args = parser.parse_args()
-from elevenlabs import generate, play, save
+if not args.quick:
+ import importlib.util
+ if importlib.util.find_spec("elevenlabs") is None:
+ print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
+ sys.exit()
-# Get a Voice object, by name or UUID
-voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
+from elevenlabs import voices, generate, play, save
+
+if args.filter and "=" in args.filter[0]:
+ voicelist = voices()
+ for f in args.filter:
+ label, value = f.split("=")
+ voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
+ voicelist = list(voicelist)
+else:
+ voicelist = list(voices())
+
+if args.list:
+ for i, v in enumerate(voicelist):
+ print(str(i) + ": " + v.name + " " + str(v.labels))
+ sys.exit()
+
+if args.voice:
+ voice = voicelist[args.voice % len(voicelist)]
+else:
+ voice = args.name
+ # if -n should consult -f, use the following
+ #voice = next(x for x in voicelist if x.name == args.name)
-# Generate the TTS
audio = generate(
- text=str(sys.argv[2:]),
- voice=voice
+ text=str(args.inputfile.read()),
+ voice=voice
)
-
-# Save the TTS to a file
-save(audio, "audio.mp3")
+if args.play:
+ play(audio)
+else:
+ save(audio, args.save)
#!/bin/bash
# Usage:
-# speak.sh <voice_id> <text-to-speak>
-
-# espeak
-# Mac OS: brew install espeak
-# Linux: apt-get install espeak
-#
-#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
-
-# piper
-#
-# https://github.com/rhasspy/piper
-#
-# Tested with Linux:
-#
-#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
+# speak <voice_id> <textfile>
+
+# Succeeds when the command named by the first argument can be found.
+function installed() { command -v "$1" >/dev/null 2>&1; }
+
+# $1 is the voice id, $2 is the text file to speak; both are quoted so that
+# values containing spaces or glob characters are passed through unchanged.
+if installed espeak; then
+    espeak -v "en-us+m$1" -s 225 -p 50 -a 200 -g 5 -k 5 -f "$2"
+
+elif installed piper && installed aplay; then
+    cat "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
# for Mac
-say "$2"
+elif installed say; then
+    # quote the text file path so that it is passed as a single argument
+    say -f "$2"
# Eleven Labs
-# To use it, install the elevenlabs module from pip (pip install elevenlabs)
-# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
-#Keep the line commented to use the free version whitout api key
-#
-#export ELEVEN_API_KEY=your_api_key
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2" >/dev/null 2>&1
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
+elif installed python3 && \
+        python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
+        installed ffplay; then
+    # It's possible to use the API for free with a limited number of characters.
+    # To increase this limit, register at https://beta.elevenlabs.io to get an api key
+    # and paste it after 'ELEVEN_API_KEY='
+    # Keep the line commented to use the free version without api key
+    #export ELEVEN_API_KEY=your_api_key
+    # quote every expansion so that paths with spaces survive word splitting
+    wd=$(dirname "$0")
+    script="$wd/eleven-labs.py"
+    # -q skips the library check (already done above), -p plays the result
+    python3 "$script" -q -p -v "$1" "$2" >/dev/null 2>&1
+
+ # Uncomment to keep the audio file
+ #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
+ #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
+
+else
+ echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
+ echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
+ echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
+ echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
+fi
-@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2\r
+@powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2\r
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser\r
param(\r
- # voice options are David or Zira\r
- [Parameter(Mandatory=$true)][string]$voice,\r
- [Parameter(Mandatory=$true)][string]$text\r
+ [Parameter(Mandatory=$true)][int]$voicenum,\r
+ [Parameter(Mandatory=$true)][string]$textfile\r
)\r
\r
Add-Type -AssemblyName System.Speech;\r
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;\r
-$speak.SelectVoice("Microsoft $voice Desktop");\r
+$voiceoptions = $speak.GetInstalledVoices("en-US");\r
+$voice = $voiceoptions[$voicenum % $voiceoptions.count];\r
+$speak.SelectVoice($voice.VoiceInfo.Name);\r
$speak.Rate="0";\r
+$text = Get-Content -Path $textfile;\r
$speak.Speak($text);\r
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
std::string speak = "./examples/talk-llama/speak";
+ std::string speak_file = "./examples/talk-llama/to_speak.txt";
std::string prompt = "";
std::string fname_out;
std::string path_session = ""; // path to file for saving/loading model eval state
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
+ else if (arg == "-sf" || arg == "--speak-file") { params.speak_file = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
+ fprintf(stderr, " -sf FILE, --speak-file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
// optionally give audio feedback that the current text is being processed
if (!params.heard_ok.empty()) {
- int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str());
- if (ret != 0) {
- fprintf(stderr, "%s: failed to speak\n", __func__);
- }
+ speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
}
// remove text between brackets using regex
}
}
- text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
- int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
- if (ret != 0) {
- fprintf(stderr, "%s: failed to speak\n", __func__);
- }
+ speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
audio.clear();
}
import sys
-import importlib.util
+import argparse
+import textwrap
-if importlib.util.find_spec("elevenlabs") is None:
- print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
- sys.exit()
+parser = argparse.ArgumentParser(add_help=False,
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument("-q", "--quick", action="store_true",
+ help="skip checking the required library")
+
+modes = parser.add_argument_group("action")
+modes.add_argument("inputfile", metavar="TEXTFILE",
+ nargs='?', type=argparse.FileType(), default=sys.stdin,
+ help="read the text file (default: stdin)")
+modes.add_argument("-l", "--list", action="store_true",
+ help="show the list of voices and exit")
+modes.add_argument("-h", "--help", action="help",
+ help="show this help and exit")
+
+selopts = parser.add_argument_group("voice selection")
+selmodes = selopts.add_mutually_exclusive_group()
+selmodes.add_argument("-n", "--name",
+ default="Arnold",
+ help="get a voice object by name (default: Arnold)")
+selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
+ help="get a voice object by number (see --list)")
+selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
+ default=["use case=narration"],
+ help=textwrap.dedent('''\
+ filter voices by labels (default: "use case=narration")
+ this option can be used multiple times
+ filtering will be disabled if the first -f has no "=" (e.g. -f "any")
+ '''))
+
+outmodes = parser.add_argument_group("output")
+outgroup = outmodes.add_mutually_exclusive_group()
+outgroup.add_argument("-s", "--save", metavar="FILE",
+ default="audio.mp3",
+ help="save the TTS to a file (default: audio.mp3)")
+outgroup.add_argument("-p", "--play", action="store_true",
+ help="play the TTS with ffplay")
+
+args = parser.parse_args()
-from elevenlabs import generate, play, save
+if not args.quick:
+ import importlib.util
+ if importlib.util.find_spec("elevenlabs") is None:
+ print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
+ sys.exit()
-# Get a Voice object, by name or UUID
-voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
+from elevenlabs import voices, generate, play, save
+
+if args.filter and "=" in args.filter[0]:
+ voicelist = voices()
+ for f in args.filter:
+ label, value = f.split("=")
+ voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
+ voicelist = list(voicelist)
+else:
+ voicelist = list(voices())
+
+if args.list:
+ for i, v in enumerate(voicelist):
+ print(str(i) + ": " + v.name + " " + str(v.labels))
+ sys.exit()
+
+if args.voice:
+ voice = voicelist[args.voice % len(voicelist)]
+else:
+ voice = args.name
+ # if -n should consult -f, use the following
+ #voice = next(x for x in voicelist if x.name == args.name)
-# Generate the TTS
audio = generate(
- text=str(sys.argv[2:]),
- voice=voice
+ text=str(args.inputfile.read()),
+ voice=voice
)
-
-# Save the TTS to a file
-save(audio, "audio.mp3")
+if args.play:
+ play(audio)
+else:
+ save(audio, args.save)
#!/bin/bash
# Usage:
-# speak.sh <voice_id> <text-to-speak>
+# speak <voice_id> <textfile>
-# espeak
-# Mac OS: brew install espeak
-# Linux: apt-get install espeak
-#
-#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
+# Succeeds when the command named by the first argument can be found.
+function installed() { command -v "$1" >/dev/null 2>&1; }
-# Mac OS "say" command
-say "$2"
+# $1 is the voice id, $2 is the text file to speak; both are quoted so that
+# values containing spaces or glob characters are passed through unchanged.
+if installed espeak; then
+    espeak -v "en-us+m$1" -s 225 -p 50 -a 200 -g 5 -k 5 -f "$2"
+
+elif installed piper && installed aplay; then
+    cat "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
+
+# for Mac
+elif installed say; then
+    say -f "$2"
# Eleven Labs
-# To use it, install the elevenlabs module from pip (pip install elevenlabs)
-# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
-#Keep the line commented to use the free version without api key
-#
-#export ELEVEN_API_KEY=your_api_key
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2"
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3
+elif installed python3 && \
+        python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
+        installed ffplay; then
+    # It's possible to use the API for free with a limited number of characters.
+    # To increase this limit, register at https://beta.elevenlabs.io to get an api key
+    # and paste it after 'ELEVEN_API_KEY='
+    # Keep the line commented to use the free version without api key
+    #export ELEVEN_API_KEY=your_api_key
+    # quote every expansion so that paths with spaces survive word splitting
+    wd=$(dirname "$0")
+    script="$wd/eleven-labs.py"
+    # -q skips the library check (already done above), -p plays the result
+    python3 "$script" -q -p -v "$1" "$2" >/dev/null 2>&1
+
+ # Uncomment to keep the audio file
+ #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
+ #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
+
+else
+ echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
+ echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
+ echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
+ echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
+fi
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser\r
param(\r
- # voice options are David or Zira\r
- [Parameter(Mandatory=$true)][string]$voice,\r
- [Parameter(Mandatory=$true)][string]$text\r
+ [Parameter(Mandatory=$true)][int]$voicenum,\r
+ [Parameter(Mandatory=$true)][string]$textfile\r
)\r
\r
Add-Type -AssemblyName System.Speech;\r
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;\r
-$speak.SelectVoice("Microsoft $voice Desktop");\r
+$voiceoptions = $speak.GetInstalledVoices("en-US");\r
+$voice = $voiceoptions[$voicenum % $voiceoptions.count];\r
+$speak.SelectVoice($voice.VoiceInfo.Name);\r
$speak.Rate="0";\r
+$text = Get-Content -Path $textfile;\r
$speak.Speak($text);\r
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_gpt = "models/ggml-gpt-2-117M.bin";
std::string speak = "./examples/talk/speak";
+ std::string speak_file= "./examples/talk/to_speak.txt";
std::string fname_out;
};
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
+ else if (arg == "-sf" || arg == "--speak_file") { params.speak_file = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
+ fprintf(stderr, " -sf FILE, --speak_file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
- text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
+ //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
// remove first 2 lines of base prompt
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
- int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
- if (ret != 0) {
- fprintf(stderr, "%s: system() failed!\n", __func__);
- }
+ speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
audio.clear();