[mimictts] Fix ssml and playing from audiosinks using the audio servlet (#14120)
* [mimictts] Fix ssml and playing from an audiosink using the audio servlet Fix : - ssml not working - add an option to store the audio on a file before sending it to openhab. It enables audiosink based on the audio servlet to play the sound (the servlet requires the getClonedStream method, unavailable with a pure streaming approach). The files are stored in the user data directory and deleted as soon as possible (stream close detection). - fix error with voice name not encoded Signed-off-by: Gwendal Roulleau <gwendal.roulleau@gmail.com>
This commit is contained in:
@@ -17,6 +17,7 @@ It supports a subset of SSML, and if you want to use it, be sure to start your t
|
|||||||
Using your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
|
Using your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
|
||||||
|
|
||||||
* **url** - Mimic URL. Default to `http://localhost:59125`
|
* **url** - Mimic URL. Default to `http://localhost:59125`
|
||||||
|
* **workaroundServletSink** - A boolean activating a workaround for audio sinks that use the openHAB audio servlet. It temporarily stores the audio file on disk, allowing the servlet to get a cloned stream as needed. Defaults to `false`.
|
||||||
* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
||||||
* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
|
* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
|
||||||
* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
||||||
|
|||||||
@@ -0,0 +1,84 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) 2010-2023 Contributors to the openHAB project
|
||||||
|
*
|
||||||
|
* See the NOTICE file(s) distributed with this work for additional
|
||||||
|
* information.
|
||||||
|
*
|
||||||
|
* This program and the accompanying materials are made available under the
|
||||||
|
* terms of the Eclipse Public License 2.0 which is available at
|
||||||
|
* http://www.eclipse.org/legal/epl-2.0
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: EPL-2.0
|
||||||
|
*/
|
||||||
|
package org.openhab.voice.mimic.internal;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.eclipse.jdt.annotation.NonNullByDefault;
|
||||||
|
import org.openhab.core.audio.AudioException;
|
||||||
|
import org.openhab.core.audio.AudioFormat;
|
||||||
|
import org.openhab.core.audio.FileAudioStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A FileAudioStream that autodelete after it and its clone are closed
|
||||||
|
* Useful to not congest temporary directory
|
||||||
|
*
|
||||||
|
* @author Gwendal Roulleau - Initial contribution
|
||||||
|
*/
|
||||||
|
@NonNullByDefault
|
||||||
|
public class AutoDeleteFileAudioStream extends FileAudioStream {
|
||||||
|
|
||||||
|
private final File file;
|
||||||
|
private final AudioFormat audioFormat;
|
||||||
|
private final List<ClonedFileInputStream> clonedAudioStreams = new ArrayList<>(1);
|
||||||
|
private boolean isOpen = true;
|
||||||
|
|
||||||
|
public AutoDeleteFileAudioStream(File file, AudioFormat format) throws AudioException {
|
||||||
|
super(file, format);
|
||||||
|
this.file = file;
|
||||||
|
this.audioFormat = format;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
super.close();
|
||||||
|
this.isOpen = false;
|
||||||
|
deleteIfPossible();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void deleteIfPossible() {
|
||||||
|
boolean aClonedStreamIsOpen = clonedAudioStreams.stream().anyMatch(as -> as.isOpen);
|
||||||
|
if (!isOpen && !aClonedStreamIsOpen) {
|
||||||
|
file.delete();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public InputStream getClonedStream() throws AudioException {
|
||||||
|
ClonedFileInputStream clonedInputStream = new ClonedFileInputStream(this, file, audioFormat);
|
||||||
|
clonedAudioStreams.add(clonedInputStream);
|
||||||
|
return clonedInputStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class ClonedFileInputStream extends FileAudioStream {
|
||||||
|
protected boolean isOpen = true;
|
||||||
|
private final AutoDeleteFileAudioStream parent;
|
||||||
|
|
||||||
|
public ClonedFileInputStream(AutoDeleteFileAudioStream parent, File file, AudioFormat audioFormat)
|
||||||
|
throws AudioException {
|
||||||
|
super(file, audioFormat);
|
||||||
|
this.parent = parent;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
super.close();
|
||||||
|
this.isOpen = false;
|
||||||
|
parent.deleteIfPossible();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -25,4 +25,5 @@ public class MimicConfiguration {
|
|||||||
public Double speakingRate = 1.0;
|
public Double speakingRate = 1.0;
|
||||||
public Double audioVolatility = 0.667;
|
public Double audioVolatility = 0.667;
|
||||||
public Double phonemeVolatility = 0.8;
|
public Double phonemeVolatility = 0.8;
|
||||||
|
public Boolean workaroundServletSink = false;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,13 +12,20 @@
|
|||||||
*/
|
*/
|
||||||
package org.openhab.voice.mimic.internal;
|
package org.openhab.voice.mimic.internal;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.UUID;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.TimeoutException;
|
import java.util.concurrent.TimeoutException;
|
||||||
@@ -31,6 +38,8 @@ import org.eclipse.jetty.client.util.InputStreamResponseListener;
|
|||||||
import org.eclipse.jetty.client.util.StringContentProvider;
|
import org.eclipse.jetty.client.util.StringContentProvider;
|
||||||
import org.eclipse.jetty.http.HttpHeader;
|
import org.eclipse.jetty.http.HttpHeader;
|
||||||
import org.eclipse.jetty.http.HttpStatus;
|
import org.eclipse.jetty.http.HttpStatus;
|
||||||
|
import org.openhab.core.OpenHAB;
|
||||||
|
import org.openhab.core.audio.AudioException;
|
||||||
import org.openhab.core.audio.AudioFormat;
|
import org.openhab.core.audio.AudioFormat;
|
||||||
import org.openhab.core.audio.AudioStream;
|
import org.openhab.core.audio.AudioStream;
|
||||||
import org.openhab.core.config.core.ConfigurableService;
|
import org.openhab.core.config.core.ConfigurableService;
|
||||||
@@ -75,6 +84,7 @@ public class MimicTTSService implements TTSService {
|
|||||||
* Configuration parameters
|
* Configuration parameters
|
||||||
*/
|
*/
|
||||||
private static final String PARAM_URL = "url";
|
private static final String PARAM_URL = "url";
|
||||||
|
private static final String PARAM_WORKAROUNDSERVLETSINK = "workaroundServletSink";
|
||||||
private static final String PARAM_SPEAKINGRATE = "speakingRate";
|
private static final String PARAM_SPEAKINGRATE = "speakingRate";
|
||||||
private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility";
|
private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility";
|
||||||
private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility";
|
private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility";
|
||||||
@@ -120,6 +130,12 @@ public class MimicTTSService implements TTSService {
|
|||||||
config.url = param.toString();
|
config.url = param.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// workaround
|
||||||
|
param = newConfig.get(PARAM_WORKAROUNDSERVLETSINK);
|
||||||
|
if (param != null) {
|
||||||
|
config.workaroundServletSink = Boolean.parseBoolean(param.toString());
|
||||||
|
}
|
||||||
|
|
||||||
// audio volatility
|
// audio volatility
|
||||||
try {
|
try {
|
||||||
param = newConfig.get(PARAM_AUDIOVOLATITLITY);
|
param = newConfig.get(PARAM_AUDIOVOLATITLITY);
|
||||||
@@ -232,22 +248,29 @@ public class MimicTTSService implements TTSService {
|
|||||||
throw new TTSException("The passed AudioFormat is unsupported");
|
throw new TTSException("The passed AudioFormat is unsupported");
|
||||||
}
|
}
|
||||||
|
|
||||||
String ssml = "";
|
String encodedVoice;
|
||||||
if (text.startsWith("<speak>")) {
|
try {
|
||||||
ssml = "&ssml=true";
|
encodedVoice = URLEncoder.encode(((MimicVoice) voice).getTechnicalName(),
|
||||||
|
StandardCharsets.UTF_8.toString());
|
||||||
|
} catch (UnsupportedEncodingException e) {
|
||||||
|
throw new IllegalArgumentException("Cannot encode voice in URL " + ((MimicVoice) voice).getTechnicalName());
|
||||||
}
|
}
|
||||||
|
|
||||||
// create the url for given locale, format
|
// create the url for given locale, format
|
||||||
String urlTTS = config.url + SYNTHETIZE_URL + "?voice=" + ((MimicVoice) voice).getTechnicalName() + ssml
|
String urlTTS = config.url + SYNTHETIZE_URL + "?voice=" + encodedVoice + "&noiseScale=" + config.audioVolatility
|
||||||
+ "&noiseScale=" + config.audioVolatility + "&noiseW=" + config.phonemeVolatility + "&lengthScale="
|
+ "&noiseW=" + config.phonemeVolatility + "&lengthScale=" + config.speakingRate + "&audioTarget=client";
|
||||||
+ config.speakingRate + "&audioTarget=client";
|
|
||||||
logger.debug("Querying mimic with URL {}", urlTTS);
|
logger.debug("Querying mimic with URL {}", urlTTS);
|
||||||
|
|
||||||
// prepare the response as an inputstream
|
// prepare the response as an inputstream
|
||||||
InputStreamResponseListener inputStreamResponseListener = new InputStreamResponseListener();
|
InputStreamResponseListener inputStreamResponseListener = new InputStreamResponseListener();
|
||||||
// we will use a POST method for the text
|
// we will use a POST method for the text
|
||||||
StringContentProvider textContentProvider = new StringContentProvider(text, StandardCharsets.UTF_8);
|
StringContentProvider textContentProvider = new StringContentProvider(text, StandardCharsets.UTF_8);
|
||||||
httpClient.POST(urlTTS).content(textContentProvider).accept("audio/wav").send(inputStreamResponseListener);
|
if (text.startsWith("<speak>")) {
|
||||||
|
httpClient.POST(urlTTS).header("Content-Type", "application/ssml+xml").content(textContentProvider)
|
||||||
|
.accept("audio/wav").send(inputStreamResponseListener);
|
||||||
|
} else {
|
||||||
|
httpClient.POST(urlTTS).content(textContentProvider).accept("audio/wav").send(inputStreamResponseListener);
|
||||||
|
}
|
||||||
|
|
||||||
// compute the estimated timeout using a "stupid" method based on text length, as the response time depends on
|
// compute the estimated timeout using a "stupid" method based on text length, as the response time depends on
|
||||||
// the requested text. Average speaker speed estimated to 10/second.
|
// the requested text. Average speaker speed estimated to 10/second.
|
||||||
@@ -269,7 +292,26 @@ public class MimicTTSService implements TTSService {
|
|||||||
"Cannot get Content-Length header from mimic response. Are you sure to query a mimic TTS server at "
|
"Cannot get Content-Length header from mimic response. Are you sure to query a mimic TTS server at "
|
||||||
+ urlTTS + " ?");
|
+ urlTTS + " ?");
|
||||||
}
|
}
|
||||||
return new InputStreamAudioStream(inputStreamResponseListener.getInputStream(), AUDIO_FORMAT, length);
|
|
||||||
|
InputStream inputStreamFromMimic = inputStreamResponseListener.getInputStream();
|
||||||
|
try {
|
||||||
|
if (!config.workaroundServletSink) {
|
||||||
|
return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length);
|
||||||
|
} else {
|
||||||
|
// Some audio sinks use the openHAB servlet to get audio. This servlet require the
|
||||||
|
// getClonedStream()
|
||||||
|
// method
|
||||||
|
// So we cache the file on disk, thus implementing the method thanks to FileAudioStream.
|
||||||
|
return createTemporaryFile(inputStreamFromMimic, AUDIO_FORMAT);
|
||||||
|
}
|
||||||
|
} catch (TTSException e) {
|
||||||
|
try {
|
||||||
|
inputStreamFromMimic.close();
|
||||||
|
} catch (IOException e1) {
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code "
|
String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code "
|
||||||
+ response.getStatus() + " for reason " + response.getReason();
|
+ response.getStatus() + " for reason " + response.getReason();
|
||||||
@@ -282,4 +324,17 @@ public class MimicTTSService implements TTSService {
|
|||||||
throw new TTSException(errorMessage, e);
|
throw new TTSException(errorMessage, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private AudioStream createTemporaryFile(InputStream inputStream, AudioFormat audioFormat) throws TTSException {
|
||||||
|
File mimicDirectory = new File(OpenHAB.getUserDataFolder(), "mimic");
|
||||||
|
mimicDirectory.mkdir();
|
||||||
|
try {
|
||||||
|
File tempFile = File.createTempFile(UUID.randomUUID().toString(), ".wav", mimicDirectory);
|
||||||
|
tempFile.deleteOnExit();
|
||||||
|
Files.copy(inputStream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
return new AutoDeleteFileAudioStream(tempFile, audioFormat);
|
||||||
|
} catch (AudioException | IOException e) {
|
||||||
|
throw new TTSException("Cannot create temporary audio file", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,12 @@
|
|||||||
<description>Mimic 3 URL.</description>
|
<description>Mimic 3 URL.</description>
|
||||||
<default>http://localhost:59125</default>
|
<default>http://localhost:59125</default>
|
||||||
</parameter>
|
</parameter>
|
||||||
|
<parameter name="workaroundServletSink" type="boolean" required="false">
|
||||||
|
<label>Workaround For Servlet-Based Audiosink</label>
|
||||||
|
<description>Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on
|
||||||
|
the openHAB audio servlet.</description>
|
||||||
|
<default>false</default>
|
||||||
|
</parameter>
|
||||||
<parameter name="speakingRate" min="0" max="1" type="decimal" required="false">
|
<parameter name="speakingRate" min="0" max="1" type="decimal" required="false">
|
||||||
<label>Speaking Rate</label>
|
<label>Speaking Rate</label>
|
||||||
<description>Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less
|
<description>Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ voice.config.mimictts.phonemeVolatility.label = Phoneme Volatility
|
|||||||
voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
||||||
voice.config.mimictts.speakingRate.label = Speaking Rate
|
voice.config.mimictts.speakingRate.label = Speaking Rate
|
||||||
voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
||||||
|
voice.config.mimictts.workaroundServletSink.label= Workaround For Servlet-Based Audiosink
|
||||||
|
voice.config.mimictts.workaroundServletSink.description= Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on the openHAB audio servlet.
|
||||||
voice.config.mimictts.url.label = URL
|
voice.config.mimictts.url.label = URL
|
||||||
voice.config.mimictts.url.description = Mimic 3 URL.
|
voice.config.mimictts.url.description = Mimic 3 URL.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user