diff --git a/CODEOWNERS b/CODEOWNERS index 73e1ed331..968324c0c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -390,6 +390,7 @@ /bundles/org.openhab.voice.googletts/ @gbicskei /bundles/org.openhab.voice.mactts/ @kaikreuzer /bundles/org.openhab.voice.marytts/ @kaikreuzer +/bundles/org.openhab.voice.mimictts/ @dalgwen /bundles/org.openhab.voice.actiontemplatehli/ @GiviMAD /bundles/org.openhab.voice.picotts/ @FlorianSW /bundles/org.openhab.voice.pollytts/ @hillmanr diff --git a/bom/openhab-addons/pom.xml b/bom/openhab-addons/pom.xml index 5de67e9d8..8cddb09a3 100644 --- a/bom/openhab-addons/pom.xml +++ b/bom/openhab-addons/pom.xml @@ -1951,6 +1951,11 @@ org.openhab.voice.marytts ${project.version} + + org.openhab.addons.bundles + org.openhab.voice.mimictts + ${project.version} + org.openhab.addons.bundles org.openhab.voice.actiontemplatehli diff --git a/bundles/org.openhab.voice.mimictts/NOTICE b/bundles/org.openhab.voice.mimictts/NOTICE new file mode 100644 index 000000000..38d625e34 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/NOTICE @@ -0,0 +1,13 @@ +This content is produced and maintained by the openHAB project. + +* Project home: https://www.openhab.org + +== Declared Project Licenses + +This program and the accompanying materials are made available under the terms +of the Eclipse Public License 2.0 which is available at +https://www.eclipse.org/legal/epl-2.0/. + +== Source Code + +https://github.com/openhab/openhab-addons diff --git a/bundles/org.openhab.voice.mimictts/README.md b/bundles/org.openhab.voice.mimictts/README.md new file mode 100644 index 000000000..265890c8d --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/README.md @@ -0,0 +1,49 @@ +# Mimic Text-to-Speech + +Mimic (version 3 and above) is an offline open source Text-To-speech engine designed by Mycroft A.I. for the eponym Vocal Assistant, that provides multiple voices, available in different languages and variants. 
+ +Its neural network is built upon some very good and some not-so-good models, so try several of them to find the one that best fits your needs. + +Mimic3 doesn't need Mycroft, and it can be run as a simple command line utility, or as a web server with an API. + +This TTS bundle makes use of this last feature, so please note: this openHAB TTS bundle is NOT standalone; it requires the Mimic web server to run somewhere (on your openHAB computer, or elsewhere on your network). + +You can find more information about the Mimic web server, and how to install it, in the [official documentation](https://mycroft-ai.gitbook.io/docs/mycroft-technologies/mimic-tts/mimic-3#installation). + +It supports a subset of SSML, and if you want to use it, be sure to start your text with `<speak>`. + +## Configuration + +Use your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set: + +* **url** - Mimic URL. Defaults to `http://localhost:59125` +* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower. +* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models. +* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models. 
+ +In case you would like to setup the service via a text file, create a new file in `$OPENHAB_ROOT/conf/services` named `mimictts.cfg` + +Its contents should look similar to: + +``` +org.openhab.voice.mimictts:url=http://localhost:59125 +org.openhab.voice.mimictts:speakingRate=1 +org.openhab.voice.mimictts:audioVolatility=0.667 +org.openhab.voice.mimictts:phonemeVolatility=0.8 +``` + +### Default Text-to-Speech and Voice Configuration + +You can setup your preferred default Text-to-Speech and default voice in the UI: + +* Go to **Settings**. +* Edit **System Services - Voice**. +* Set **Mimic** as **Default Text-to-Speech**. +* Choose your preferred **Default Voice** for your setup. + +In case you would like to setup these settings via a text file, you can edit the file `runtime.cfg` in `$OPENHAB_ROOT/conf/services` and set the following entries: + +``` +org.openhab.voice:defaultTTS=mimictts +org.openhab.voice:defaultVoice=mimictts:fr_FR_siwis_low +``` diff --git a/bundles/org.openhab.voice.mimictts/pom.xml b/bundles/org.openhab.voice.mimictts/pom.xml new file mode 100644 index 000000000..84130a2c1 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/pom.xml @@ -0,0 +1,17 @@ + + + + 4.0.0 + + + org.openhab.addons.bundles + org.openhab.addons.reactor.bundles + 3.4.0-SNAPSHOT + + + org.openhab.voice.mimictts + + openHAB Add-ons :: Bundles :: Voice :: mimic Text-To-Speech + + diff --git a/bundles/org.openhab.voice.mimictts/src/main/feature/feature.xml b/bundles/org.openhab.voice.mimictts/src/main/feature/feature.xml new file mode 100644 index 000000000..41f308f2f --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/feature/feature.xml @@ -0,0 +1,9 @@ + + + mvn:org.openhab.core.features.karaf/org.openhab.core.features.karaf.openhab-core/${ohc.version}/xml/features + + + openhab-runtime-base + mvn:org.openhab.addons.bundles/org.openhab.voice.mimictts/${project.version} + + diff --git 
a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java new file mode 100644 index 000000000..4061a96e7 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java @@ -0,0 +1,28 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.mimic.internal; + +import org.eclipse.jdt.annotation.NonNullByDefault; + +/** + * The {@link MimicConfiguration} class contains fields mapping configuration parameters. + * + * @author Gwendal Roulleau - Initial contribution + */ +@NonNullByDefault +public class MimicConfiguration { + public String url = "http://localhost:59125"; + public Double speakingRate = 1.0; + public Double audioVolatility = 0.667; + public Double phonemeVolatility = 0.8; +} diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java new file mode 100644 index 000000000..b82befda8 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java @@ -0,0 +1,249 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.mimic.internal; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.eclipse.jdt.annotation.NonNullByDefault; +import org.eclipse.jdt.annotation.Nullable; +import org.openhab.core.audio.AudioFormat; +import org.openhab.core.audio.AudioStream; +import org.openhab.core.audio.ByteArrayAudioStream; +import org.openhab.core.config.core.ConfigurableService; +import org.openhab.core.io.net.http.HttpRequestBuilder; +import org.openhab.core.io.net.http.HttpUtil; +import org.openhab.core.library.types.RawType; +import org.openhab.core.voice.TTSException; +import org.openhab.core.voice.TTSService; +import org.openhab.core.voice.Voice; +import org.openhab.voice.mimic.internal.dto.VoiceDto; +import org.osgi.framework.Constants; +import org.osgi.service.component.annotations.Activate; +import org.osgi.service.component.annotations.Component; +import org.osgi.service.component.annotations.Modified; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonSyntaxException; + +/** + * Mimic Voice service implementation. 
+ * + * @author Gwendal Roulleau - Initial contribution + */ +@Component(configurationPid = MimicTTSService.SERVICE_PID, property = Constants.SERVICE_PID + "=" + + MimicTTSService.SERVICE_PID) +@ConfigurableService(category = MimicTTSService.SERVICE_CATEGORY, label = MimicTTSService.SERVICE_NAME + + " Text-to-Speech", description_uri = MimicTTSService.SERVICE_CATEGORY + ":" + MimicTTSService.SERVICE_ID) +@NonNullByDefault +public class MimicTTSService implements TTSService { + + static final String SERVICE_CATEGORY = "voice"; + static final String SERVICE_ID = "mimictts"; + static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID; + static final String SERVICE_NAME = "Mimic"; + + /** + * Configuration parameters + */ + private static final String PARAM_URL = "url"; + private static final String PARAM_SPEAKINGRATE = "speakingRate"; + private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility"; + private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility"; + + /** + * Url + */ + private static final String LIST_VOICES_URL = "/api/voices"; + private static final String SYNTHETIZE_URL = "/api/tts"; + + /** The only wave format supported */ + private static final AudioFormat AUDIO_FORMAT = new AudioFormat(AudioFormat.CONTAINER_WAVE, + AudioFormat.CODEC_PCM_SIGNED, false, 16, 52000, 22050L, 1); + + private Set availableVoices = new HashSet<>(); + + /** + * Logger. + */ + private final Logger logger = LoggerFactory.getLogger(MimicTTSService.class); + + private final MimicConfiguration config = new MimicConfiguration(); + + private final Gson gson = new GsonBuilder().create(); + + @Activate + protected void activate(Map config) { + updateConfig(config); + } + + /** + * Called by the framework when the configuration was updated. 
+ * + * @param newConfig Updated configuration + */ + @Modified + private void updateConfig(Map newConfig) { + logger.debug("Updating configuration"); + + // client id + Object param = newConfig.get(PARAM_URL); + if (param == null) { + logger.warn("Missing URL to access Mimic TTS API. Using localhost"); + } else { + config.url = param.toString(); + } + + // audio volatility + try { + param = newConfig.get(PARAM_AUDIOVOLATITLITY); + if (param != null) { + config.audioVolatility = Double.parseDouble(param.toString()); + } + } catch (NumberFormatException e) { + logger.warn("Cannot parse audioVolatility parameter. Using default"); + } + + // phoneme volatility + try { + param = newConfig.get(PARAM_PHONEMEVOLATITLITY); + if (param != null) { + config.phonemeVolatility = Double.parseDouble(param.toString()); + } + } catch (NumberFormatException e) { + logger.warn("Cannot parse phonemeVolatility parameter. Using default"); + } + + // speakingRate + try { + param = newConfig.get(PARAM_SPEAKINGRATE); + if (param != null) { + config.speakingRate = Double.parseDouble(param.toString()); + } + } catch (NumberFormatException e) { + logger.warn("Cannot parse speakingRate parameter. 
Using default"); + } + + refreshVoices(); + } + + @Override + public String getId() { + return SERVICE_ID; + } + + @Override + public String getLabel(@Nullable Locale locale) { + return SERVICE_NAME; + } + + @Override + public Set getAvailableVoices() { + return availableVoices; + } + + public void refreshVoices() { + String url = config.url + LIST_VOICES_URL; + availableVoices.clear(); + try { + String responseVoices = HttpRequestBuilder.getFrom(url).getContentAsString(); + VoiceDto[] mimicVoiceResponse = gson.fromJson(responseVoices, VoiceDto[].class); + if (mimicVoiceResponse == null) { + logger.warn("Cannot get mimic voices from the URL {}", url); + return; + } else if (mimicVoiceResponse.length == 0) { + logger.debug("Voice set response from Mimic is empty ?!"); + return; + } + for (VoiceDto voiceDto : mimicVoiceResponse) { + if (voiceDto.speakers != null && voiceDto.speakers.size() > 0) { + for (String speaker : voiceDto.speakers) { + availableVoices.add(new MimicVoice(voiceDto.key, voiceDto.language, voiceDto.name, speaker)); + } + } else { + availableVoices.add(new MimicVoice(voiceDto.key, voiceDto.language, voiceDto.name, null)); + } + } + } catch (IOException | JsonSyntaxException e) { + logger.warn("Cannot get mimic voices from the URL {}, error {}", url, e.getMessage()); + } + } + + @Override + public Set getSupportedFormats() { + return Set. of(AUDIO_FORMAT); + } + + /** + * Checks parameters and calls the API to synthesize voice. + * + * @param text Input text. + * @param voice Selected voice. + * @param requestedFormat Format that is supported by the target sink as well. + * @return Output audio stream + * @throws TTSException in case the service is unavailable or a parameter is invalid. 
+ */ + @Override + public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException { + + if (!availableVoices.contains(voice)) { + // let a chance for the service to update : + refreshVoices(); + if (!availableVoices.contains(voice)) { + throw new TTSException("Voice " + voice.getUID() + " not available for MimicTTS"); + } + } + + logger.debug("Synthesize '{}' for voice '{}' in format {}", text, voice.getUID(), requestedFormat); + // Validate arguments + // trim text + String trimmedText = text.trim(); + if (trimmedText.isEmpty()) { + throw new TTSException("The passed text is empty"); + } + if (!AUDIO_FORMAT.isCompatible(requestedFormat)) { + throw new TTSException("The passed AudioFormat is unsupported"); + } + String encodedText; + try { + encodedText = URLEncoder.encode(text, StandardCharsets.UTF_8.toString()); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException("Cannot encode text in URL " + text); + } + + String ssml = ""; + if (text.startsWith("")) { + ssml = "&ssml=true"; + } + + // create the audio byte array for given text, locale, format + String urlTTS = config.url + SYNTHETIZE_URL + "?text=" + encodedText + "&voice=" + + ((MimicVoice) voice).getTechnicalName() + ssml + "&noiseScale=" + config.audioVolatility + "&noiseW=" + + config.phonemeVolatility + "&lengthScale=" + config.speakingRate + "&audioTarget=client"; + logger.debug("Querying mimic with URL {}", urlTTS); + RawType responseWav = HttpUtil.downloadData(urlTTS, "audio/wav", false, -1); + if (responseWav == null) { + throw new TTSException("Cannot get wav from mimic url " + urlTTS); + } + return new ByteArrayAudioStream(responseWav.getBytes(), AUDIO_FORMAT); + } +} diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicVoice.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicVoice.java new file mode 100644 index 000000000..5ff482aa4 --- 
/dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicVoice.java @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.mimic.internal; + +import java.util.Locale; +import java.util.Objects; + +import org.eclipse.jdt.annotation.NonNullByDefault; +import org.eclipse.jdt.annotation.Nullable; +import org.openhab.core.voice.Voice; + +/** + * Mimic Voice representation. + * + * @author Gwendal Roulleau - Initial contribution + */ +@NonNullByDefault +public class MimicVoice implements Voice { + + @Nullable + private String speaker; + + private final Locale locale; + + private final String key; + + private final String name; + + public MimicVoice(String key, String language, String name, @Nullable String speaker) { + this.key = key; + this.locale = Locale.forLanguageTag(language.replaceAll("_", "-")); + this.name = name; + this.speaker = speaker; + } + + /** + * Globally unique identifier of the voice. + * + * @return A String uniquely identifying the voice globally + */ + @Override + public String getUID() { + return "mimictts:" + getTechnicalName().replaceAll("[^a-zA-Z0-9_]", "_"); + } + + /** + * Technical name of the voice. + * + * @return A String voice technical name + */ + public String getTechnicalName() { + String speakerId = (speaker != null) ? "#" + speaker : ""; + return (key + speakerId); + } + + @Override + public String getLabel() { + return name + ((speaker != null) ? 
" (" + speaker + ")" : ""); + } + + @Override + public Locale getLocale() { + return locale; + } + + @Override + public int hashCode() { + return Objects.hash(getTechnicalName()); + } + + @Override + public boolean equals(@Nullable Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + MimicVoice other = (MimicVoice) obj; + return Objects.equals(getTechnicalName(), other.getTechnicalName()); + } +} diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/dto/VoiceDto.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/dto/VoiceDto.java new file mode 100644 index 000000000..a9dedf206 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/dto/VoiceDto.java @@ -0,0 +1,32 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.mimic.internal.dto; + +import java.util.ArrayList; +import java.util.List; + +import org.eclipse.jdt.annotation.NonNullByDefault; + +/** + * Mimic Voice DTO. 
+ * + * @author Gwendal Roulleau - Initial contribution + */ +@NonNullByDefault +public class VoiceDto { + + public String key = "UNDEFINED"; + public String language = "UNDEFINED"; + public String name = "UNDEFINED"; + public List speakers = new ArrayList<>(); +} diff --git a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml new file mode 100644 index 000000000..ab8619a39 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml @@ -0,0 +1,37 @@ + + + + + + + Mimic 3 URL. + http://localhost:59125 + + + + Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less + than 1 is faster, and more than 1 is slower. + 1 + + + + The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice + model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models. + + 0.667 + + + + The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with + a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme + variability than single speaker models. + + 0.8 + + + + diff --git a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties new file mode 100644 index 000000000..f34268e20 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties @@ -0,0 +1,11 @@ +voice.config.mimictts.audioVolatility.label = Audio Volatility +voice.config.mimictts.audioVolatility.description = The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. 
Multi-speaker models tend to sound better with a lower amount of noise than single speaker models. +voice.config.mimictts.phonemeVolatility.label = Phoneme Volatility +voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models. +voice.config.mimictts.speakingRate.label = Speaking Rate +voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower. +voice.config.mimictts.url.label = URL +voice.config.mimictts.url.description = Mimic 3 URL. + +# service +service.voice.mimictts.label = Mimic Text-to-Speech diff --git a/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts_fr.properties b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts_fr.properties new file mode 100644 index 000000000..53401f653 --- /dev/null +++ b/bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts_fr.properties @@ -0,0 +1,11 @@ +voice.config.mimictts.audioVolatility.label = Volatilité Audio +voice.config.mimictts.audioVolatility.description = Quantité de bruit ajouté à l'audio généré (0-1). Peut aider à masquer les artefacts du modèle de voix. Les modèles à plusieurs élocuteurs sonnent mieux avec une quantité réduite de bruit, contrairement aux modèles à élocuteur unique. +voice.config.mimictts.phonemeVolatility.label = Volatilité Phonème +voice.config.mimictts.phonemeVolatility.description = La quantité de bruit utilisé pour générer les durées des phonèmes (0-1). Permet une cadence d'élocution variable, avec une valeur proche de 1 signifiant plus de variabilité. 
Les modèles à plusieurs élocuteurs sonnent mieux avec une quantité réduite de bruit, contrairement aux modèles à élocuteur unique. +voice.config.mimictts.speakingRate.label = Vitesse d'élocution +voice.config.mimictts.speakingRate.description = Controle la vitesse d'élocution. Une valeur de 1 correspond à la vitesse d'entrainement du jeu de données. Inférieur à 1 est plus rapide, et supérieur à 1 est plus lent. +voice.config.mimictts.url.label = URL +voice.config.mimictts.url.description = L'URL pour joindre l'API de Mimic. + +# service +service.voice.mimictts.label = Synthèse vocale Mimic diff --git a/bundles/pom.xml b/bundles/pom.xml index 21155cccc..5ec620b44 100644 --- a/bundles/pom.xml +++ b/bundles/pom.xml @@ -410,6 +410,7 @@ org.openhab.voice.googletts org.openhab.voice.mactts org.openhab.voice.marytts + org.openhab.voice.mimictts org.openhab.voice.actiontemplatehli org.openhab.voice.picotts org.openhab.voice.pollytts