From d497defe34c1ea21fb6b1932969df800fe90b9dc Mon Sep 17 00:00:00 2001 From: Gwendal Roulleau Date: Sat, 14 Jan 2023 09:39:59 +0100 Subject: [PATCH] [mimictts] Fix ssml and playing from audiosinks using the audio servlet (#14120) * [mimictts] Fix ssml and playing from an audiosink using the audio servlet Fix : - ssml not working - add an option to store the audio on a file before sending it to openhab. It enables audiosink based on the audio servlet to play the sound (the servlet requires the getClonedStream method, unavailable with a pure streaming approach). The files are stored in the user data directory and deleted as soon as possible (stream close detection). - fix error with voice name not encoded Signed-off-by: Gwendal Roulleau --- bundles/org.openhab.voice.mimictts/README.md | 1 + .../internal/AutoDeleteFileAudioStream.java | 84 +++++++++++++++++++ .../mimic/internal/MimicConfiguration.java | 1 + .../voice/mimic/internal/MimicTTSService.java | 71 ++++++++++++++-- .../main/resources/OH-INF/config/config.xml | 6 ++ .../resources/OH-INF/i18n/mimictts.properties | 2 + 6 files changed, 157 insertions(+), 8 deletions(-) create mode 100644 bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java diff --git a/bundles/org.openhab.voice.mimictts/README.md b/bundles/org.openhab.voice.mimictts/README.md index 265890c8d..b5d572a25 100644 --- a/bundles/org.openhab.voice.mimictts/README.md +++ b/bundles/org.openhab.voice.mimictts/README.md @@ -17,6 +17,7 @@ It supports a subset of SSML, and if you want to use it, be sure to start your t Using your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set: * **url** - Mimic URL. Default to `http://localhost:59125` +* **workaroundServletSink** - A boolean activating a workaround for audiosink using the openHAB servlet. It stores audio file temporarily on disk, allowing the servlet to get a cloned stream as needed. Default false. * **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower. * **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models. * **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models. diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java new file mode 100644 index 000000000..465a2b2bd --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java @@ -0,0 +1,84 @@ +/** + * Copyright (c) 2010-2023 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.mimic.internal; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.eclipse.jdt.annotation.NonNullByDefault; +import org.openhab.core.audio.AudioException; +import org.openhab.core.audio.AudioFormat; +import org.openhab.core.audio.FileAudioStream; + +/** + * A FileAudioStream that autodelete after it and its clone are closed + * Useful to not congest temporary directory + * + * @author Gwendal Roulleau - Initial contribution + */ +@NonNullByDefault +public class AutoDeleteFileAudioStream extends FileAudioStream { + + private final File file; + private final AudioFormat audioFormat; + private final List clonedAudioStreams = new ArrayList<>(1); + private boolean isOpen = true; + + public AutoDeleteFileAudioStream(File file, AudioFormat format) throws AudioException { + super(file, format); + this.file = file; + this.audioFormat = format; + } + + @Override + public void close() throws IOException { + super.close(); + this.isOpen = false; + deleteIfPossible(); + } + + protected void deleteIfPossible() { + boolean aClonedStreamIsOpen = clonedAudioStreams.stream().anyMatch(as -> as.isOpen); + if (!isOpen && !aClonedStreamIsOpen) { + file.delete(); + } + } + + @Override + public InputStream getClonedStream() throws AudioException { + ClonedFileInputStream clonedInputStream = new ClonedFileInputStream(this, file, audioFormat); + clonedAudioStreams.add(clonedInputStream); + return clonedInputStream; + } + + private static class ClonedFileInputStream extends FileAudioStream { + protected boolean isOpen = true; + private final AutoDeleteFileAudioStream parent; + + public ClonedFileInputStream(AutoDeleteFileAudioStream parent, File file, AudioFormat audioFormat) + throws AudioException { + super(file, audioFormat); + this.parent = parent; + } + + @Override + public void close() throws IOException { + super.close(); + this.isOpen = false; + parent.deleteIfPossible(); + } + } +} diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java index e8c56c146..e35064e39 100644 --- a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java @@ -25,4 +25,5 @@ public class MimicConfiguration { public Double speakingRate = 1.0; public Double audioVolatility = 0.667; public Double phonemeVolatility = 0.8; + public Boolean workaroundServletSink = false; } diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java index abde4cb0f..39364035c 100644 --- a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java @@ -12,13 +12,20 @@ */ package org.openhab.voice.mimic.internal; +import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.UUID; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -31,6 +38,8 @@ import org.eclipse.jetty.client.util.InputStreamResponseListener; import org.eclipse.jetty.client.util.StringContentProvider; import org.eclipse.jetty.http.HttpHeader; import org.eclipse.jetty.http.HttpStatus; +import org.openhab.core.OpenHAB; +import org.openhab.core.audio.AudioException; import org.openhab.core.audio.AudioFormat; import org.openhab.core.audio.AudioStream; import org.openhab.core.config.core.ConfigurableService; @@ -75,6 +84,7 @@ public class MimicTTSService implements TTSService { * Configuration parameters */ private static final String PARAM_URL = "url"; + private static final String PARAM_WORKAROUNDSERVLETSINK = "workaroundServletSink"; private static final String PARAM_SPEAKINGRATE = "speakingRate"; private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility"; private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility"; @@ -120,6 +130,12 @@ public class MimicTTSService implements TTSService { config.url = param.toString(); } + // workaround + param = newConfig.get(PARAM_WORKAROUNDSERVLETSINK); + if (param != null) { + config.workaroundServletSink = Boolean.parseBoolean(param.toString()); + } + // audio volatility try { param = newConfig.get(PARAM_AUDIOVOLATITLITY); @@ -232,22 +248,29 @@ public class MimicTTSService implements TTSService { throw new TTSException("The passed AudioFormat is unsupported"); } - String ssml = ""; - if (text.startsWith("")) { - ssml = "&ssml=true"; + String encodedVoice; + try { + encodedVoice = URLEncoder.encode(((MimicVoice) voice).getTechnicalName(), + StandardCharsets.UTF_8.toString()); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("Cannot encode voice in URL " + ((MimicVoice) voice).getTechnicalName()); } // create the url for given locale, format - String urlTTS = config.url + SYNTHETIZE_URL + "?voice=" + ((MimicVoice) voice).getTechnicalName() + ssml - + "&noiseScale=" + config.audioVolatility + "&noiseW=" + config.phonemeVolatility + "&lengthScale=" - + config.speakingRate + "&audioTarget=client"; + String urlTTS = config.url + SYNTHETIZE_URL + "?voice=" + encodedVoice + "&noiseScale=" + config.audioVolatility + + "&noiseW=" + config.phonemeVolatility + "&lengthScale=" + config.speakingRate + "&audioTarget=client"; logger.debug("Querying mimic with URL {}", urlTTS); // prepare the response as an inputstream InputStreamResponseListener inputStreamResponseListener = new InputStreamResponseListener(); // we will use a POST method for the text StringContentProvider textContentProvider = new StringContentProvider(text, StandardCharsets.UTF_8); - httpClient.POST(urlTTS).content(textContentProvider).accept("audio/wav").send(inputStreamResponseListener); + if (text.startsWith("")) { + httpClient.POST(urlTTS).header("Content-Type", "application/ssml+xml").content(textContentProvider) + .accept("audio/wav").send(inputStreamResponseListener); + } else { + httpClient.POST(urlTTS).content(textContentProvider).accept("audio/wav").send(inputStreamResponseListener); + } // compute the estimated timeout using a "stupid" method based on text length, as the response time depends on // the requested text. Average speaker speed estimated to 10/second. @@ -269,7 +292,26 @@ public class MimicTTSService implements TTSService { "Cannot get Content-Length header from mimic response. Are you sure to query a mimic TTS server at " + urlTTS + " ?"); } - return new InputStreamAudioStream(inputStreamResponseListener.getInputStream(), AUDIO_FORMAT, length); + + InputStream inputStreamFromMimic = inputStreamResponseListener.getInputStream(); + try { + if (!config.workaroundServletSink) { + return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length); + } else { + // Some audio sinks use the openHAB servlet to get audio. This servlet require the + // getClonedStream() + // method + // So we cache the file on disk, thus implementing the method thanks to FileAudioStream. + return createTemporaryFile(inputStreamFromMimic, AUDIO_FORMAT); + } + } catch (TTSException e) { + try { + inputStreamFromMimic.close(); + } catch (IOException e1) { + } + throw e; + } + } else { String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code " + response.getStatus() + " for reason " + response.getReason(); @@ -282,4 +324,17 @@ public class MimicTTSService implements TTSService { throw new TTSException(errorMessage, e); } } + + private AudioStream createTemporaryFile(InputStream inputStream, AudioFormat audioFormat) throws TTSException { + File mimicDirectory = new File(OpenHAB.getUserDataFolder(), "mimic"); + mimicDirectory.mkdir(); + try { + File tempFile = File.createTempFile(UUID.randomUUID().toString(), ".wav", mimicDirectory); + tempFile.deleteOnExit(); + Files.copy(inputStream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + return new AutoDeleteFileAudioStream(tempFile, audioFormat); + } catch (AudioException | IOException e) { + throw new TTSException("Cannot create temporary audio file", e); + } + } } diff --git a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml index ab8619a39..2107070d7 100644 --- a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml +++ b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml @@ -11,6 +11,12 @@ Mimic 3 URL. http://localhost:59125 + + + Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on + the openHAB audio servlet. + false + Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less diff --git a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties index f34268e20..2472d5b89 100644 --- a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties +++ b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties @@ -4,6 +4,8 @@ voice.config.mimictts.phonemeVolatility.label = Phoneme Volatility voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models. voice.config.mimictts.speakingRate.label = Speaking Rate voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower. +voice.config.mimictts.workaroundServletSink.label= Workaround For Servlet-Based Audiosink +voice.config.mimictts.workaroundServletSink.description= Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on the openHAB audio servlet. voice.config.mimictts.url.label = URL voice.config.mimictts.url.description = Mimic 3 URL.