// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dialogflow.v2;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";

option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.Dialogflow.V2";
option go_package = "cloud.google.com/go/dialogflow/apiv2/dialogflowpb;dialogflowpb";
option java_multiple_files = true;
option java_outer_classname = "AudioConfigProto";
option java_package = "com.google.cloud.dialogflow.v2";
option objc_class_prefix = "DF";
option (google.api.resource_definition) = {
  type: "automl.googleapis.com/Model"
  pattern: "projects/{project}/locations/{location}/models/{model}"
};
option (google.api.resource_definition) = {
  type: "speech.googleapis.com/PhraseSet"
  pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}"
};

// Audio encoding of the audio content sent in the conversational query request.
// Refer to the
// [Cloud Speech API
// documentation](https://cloud.google.com/speech-to-text/docs/basics) for more
// details.
enum AudioEncoding {
  // Not specified.
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  AUDIO_ENCODING_LINEAR_16 = 1;

  // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
  // Codec) is the recommended encoding because it is lossless (therefore
  // recognition is not compromised) and requires only about half the
  // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples, however, not all fields in `STREAMINFO` are supported.
  AUDIO_ENCODING_FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  AUDIO_ENCODING_MULAW = 3;

  // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  AUDIO_ENCODING_AMR = 4;

  // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_AMR_WB = 5;

  // Opus encoded audio frames in Ogg container
  // ([OggOpus](https://wiki.xiph.org/OggOpus)).
  // `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_OGG_OPUS = 6;

  // Although the use of lossy encodings is not recommended, if a very low
  // bitrate encoding is required, `OGG_OPUS` is highly preferred over
  // Speex encoding. The [Speex](https://speex.org/) encoding supported by
  // Dialogflow API has a header byte in each block, as in MIME type
  // `audio/x-speex-with-header-byte`.
  // It is a variant of the RTP Speex encoding defined in
  // [RFC 5574](https://tools.ietf.org/html/rfc5574).
  // The stream is a sequence of blocks, one block per RTP packet. Each block
  // starts with a byte containing the length of the block, in bytes, followed
  // by one or more frames of Speex data, padded to an integral number of
  // bytes (octets) as specified in RFC 5574. In other words, each RTP header
  // is replaced with a single byte containing the block length. Only Speex
  // wideband is supported. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}

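// Illustrative example (not part of the generated API): several of the
// encodings above fix the sample rate, so the matching `InputAudioConfig`
// (defined later in this file) has to use that rate. In text format, an
// AMR-WB request might carry:
//
//   audio_encoding: AUDIO_ENCODING_AMR_WB
//   sample_rate_hertz: 16000
//
// The codec chosen here is only a sketch of the constraint documented above,
// not a recommendation of a particular encoding.
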
// Hints for the speech recognizer to help with recognition in a specific
// conversation state.
message SpeechContext {
  // Optional. A list of strings containing words and phrases that the speech
  // recognizer should recognize with higher likelihood.
  //
  // This list can be used to:
  //
  // * improve accuracy for words and phrases you expect the user to say,
  //   e.g. typical commands for your Dialogflow agent
  // * add additional words to the speech recognizer vocabulary
  // * ...
  //
  // See the [Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
  // limits.
  repeated string phrases = 1;

  // Optional. Boost for this context compared to other contexts:
  //
  // * If the boost is positive, Dialogflow will increase the probability that
  //   the phrases in this context are recognized over similar sounding phrases.
  // * If the boost is unspecified or non-positive, Dialogflow will not apply
  //   any boost.
  //
  // Dialogflow recommends that you use boosts in the range (0, 20] and that you
  // find a value that fits your use case with binary search.
  float boost = 2;
}

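// Illustrative example (not part of the generated API): a `SpeechContext` in
// text format. The phrases are hypothetical, and the boost is an arbitrary
// value taken from the recommended (0, 20] range:
//
//   phrases: "check my order status"
//   phrases: "talk to a live agent"
//   boost: 10.0
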
// Variant of the specified [Speech
// model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use.
//
// See the [Cloud Speech
// documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
// for which models have different variants. For example, the "phone_call" model
// has both a standard and an enhanced variant. When you use an enhanced model,
// you will generally receive higher quality results than for a standard model.
enum SpeechModelVariant {
  // No model variant specified. In this case Dialogflow defaults to
  // USE_BEST_AVAILABLE.
  SPEECH_MODEL_VARIANT_UNSPECIFIED = 0;

  // Use the best available variant of the [Speech
  // model][InputAudioConfig.model] that the caller is eligible for.
  //
  // Please see the [Dialogflow
  // docs](https://cloud.google.com/dialogflow/docs/data-logging) for
  // how to make your project eligible for enhanced models.
  USE_BEST_AVAILABLE = 1;

  // Use standard model variant even if an enhanced model is available. See the
  // [Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
  // for details about enhanced models.
  USE_STANDARD = 2;

  // Use an enhanced model variant:
  //
  // * If an enhanced variant does not exist for the given
  //   [model][google.cloud.dialogflow.v2.InputAudioConfig.model] and request
  //   language, Dialogflow falls back to the standard variant.
  //
  //   The [Cloud Speech
  //   documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
  //   describes which models have enhanced variants.
  //
  // * If the API caller isn't eligible for enhanced models, Dialogflow returns
  //   an error. Please see the [Dialogflow
  //   docs](https://cloud.google.com/dialogflow/docs/data-logging)
  //   for how to make your project eligible.
  USE_ENHANCED = 3;
}

// Information for a word recognized by the speech recognizer.
message SpeechWordInfo {
  // The word this info is for.
  string word = 3;

  // Time offset relative to the beginning of the audio that corresponds to the
  // start of the spoken word. This is an experimental feature and the accuracy
  // of the time offset can vary.
  google.protobuf.Duration start_offset = 1;

  // Time offset relative to the beginning of the audio that corresponds to the
  // end of the spoken word. This is an experimental feature and the accuracy of
  // the time offset can vary.
  google.protobuf.Duration end_offset = 2;

  // The Speech confidence between 0.0 and 1.0 for this word. A higher number
  // indicates an estimated greater likelihood that the recognized word is
  // correct. The default of 0.0 is a sentinel value indicating that confidence
  // was not set.
  //
  // This field is not guaranteed to be fully stable over time for the same
  // audio input. Users should also not rely on it to always be provided.
  float confidence = 4;
}

// Instructs the speech recognizer how to process the audio content.
message InputAudioConfig {
  // Required. Audio encoding of the audio content to process.
  AudioEncoding audio_encoding = 1;

  // Required. Sample rate (in Hertz) of the audio content sent in the query.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics) for
  // more details.
  int32 sample_rate_hertz = 2;

  // Required. The language of the supplied audio. Dialogflow does not do
  // translations. See [Language
  // Support](https://cloud.google.com/dialogflow/docs/reference/language)
  // for a list of the currently supported language codes. Note that queries in
  // the same session do not necessarily need to specify the same language.
  string language_code = 3;

  // If `true`, Dialogflow returns
  // [SpeechWordInfo][google.cloud.dialogflow.v2.SpeechWordInfo] in
  // [StreamingRecognitionResult][google.cloud.dialogflow.v2.StreamingRecognitionResult]
  // with information about the recognized speech words, e.g. start and end time
  // offsets. If false or unspecified, Speech doesn't return any word-level
  // information.
  bool enable_word_info = 13;

  // A list of strings containing words and phrases that the speech
  // recognizer should recognize with higher likelihood.
  //
  // See [the Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints)
  // for more details.
  //
  // This field is deprecated. Please use [speech_contexts]() instead. If you
  // specify both [phrase_hints]() and [speech_contexts](), Dialogflow will
  // treat the [phrase_hints]() as a single additional [SpeechContext]().
  repeated string phrase_hints = 4 [deprecated = true];

  // Context information to assist speech recognition.
  //
  // See [the Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints)
  // for more details.
  repeated SpeechContext speech_contexts = 11;

  // Which Speech model to select for the given request. Select the
  // model best suited to your domain to get best results. If a model is not
  // explicitly specified, then we auto-select a model based on the parameters
  // in the InputAudioConfig.
  // If enhanced speech model is enabled for the agent and an enhanced
  // version of the specified model for the language does not exist, then the
  // speech is recognized using the standard version of the specified model.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#select-model)
  // for more details.
  // If you specify a model, the following models typically have the best
  // performance:
  //
  // - phone_call (best for Agent Assist and telephony)
  // - latest_short (best for Dialogflow non-telephony)
  // - command_and_search (best for very short utterances and commands)
  string model = 7;

  // Which variant of the [Speech
  // model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use.
  SpeechModelVariant model_variant = 10;

  // If `false` (default), recognition does not cease until the
  // client closes the stream.
  // If `true`, the recognizer will detect a single spoken utterance in input
  // audio. Recognition ceases when it detects the audio's voice has
  // stopped or paused. In this case, once a detected intent is received, the
  // client should close the stream and start a new request with a new stream as
  // needed.
  // Note: This setting is relevant only for streaming methods.
  // Note: When specified, InputAudioConfig.single_utterance takes precedence
  // over StreamingDetectIntentRequest.single_utterance.
  bool single_utterance = 8;

  // Only used in
  // [Participants.AnalyzeContent][google.cloud.dialogflow.v2.Participants.AnalyzeContent]
  // and
  // [Participants.StreamingAnalyzeContent][google.cloud.dialogflow.v2.Participants.StreamingAnalyzeContent].
  // If `false` and recognition doesn't return any result, trigger
  // `NO_SPEECH_RECOGNIZED` event to Dialogflow agent.
  bool disable_no_speech_recognized_event = 14;

  // Enable automatic punctuation option at the speech backend.
  bool enable_automatic_punctuation = 17;
}

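// Illustrative example (not part of the generated API): a possible
// `InputAudioConfig` in text format for a telephony use case. The language
// code, phrase, model name, and flag values are assumptions chosen only to
// exercise the fields documented above:
//
//   audio_encoding: AUDIO_ENCODING_LINEAR_16
//   sample_rate_hertz: 8000
//   language_code: "en-US"
//   enable_word_info: true
//   speech_contexts { phrases: "order number" boost: 10.0 }
//   model: "phone_call"
//   model_variant: USE_ENHANCED
//   single_utterance: true
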
// Gender of the voice as described in
// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
enum SsmlVoiceGender {
  // An unspecified gender, which means that the client doesn't care which
  // gender the selected voice will have.
  SSML_VOICE_GENDER_UNSPECIFIED = 0;

  // A male voice.
  SSML_VOICE_GENDER_MALE = 1;

  // A female voice.
  SSML_VOICE_GENDER_FEMALE = 2;

  // A gender-neutral voice.
  SSML_VOICE_GENDER_NEUTRAL = 3;
}

// Description of which voice to use for speech synthesis.
message VoiceSelectionParams {
  // Optional. The name of the voice. If not set, the service will choose a
  // voice based on the other parameters such as language_code and
  // [ssml_gender][google.cloud.dialogflow.v2.VoiceSelectionParams.ssml_gender].
  string name = 1;

  // Optional. The preferred gender of the voice. If not set, the service will
  // choose a voice based on the other parameters such as language_code and
  // [name][google.cloud.dialogflow.v2.VoiceSelectionParams.name]. Note that
  // this is only a preference, not a requirement. If a voice of the appropriate
  // gender is not available, the synthesizer should substitute a voice with a
  // different gender rather than failing the request.
  SsmlVoiceGender ssml_gender = 2;
}

// Configuration of how speech should be synthesized.
message SynthesizeSpeechConfig {
  // Optional. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal
  // native speed supported by the specific voice. 2.0 is twice as fast, and
  // 0.5 is half as fast. If unset (0.0), defaults to the native 1.0 speed. Any
  // other values < 0.25 or > 4.0 will return an error.
  double speaking_rate = 1;

  // Optional. Speaking pitch, in the range [-20.0, 20.0]. 20 means increase 20
  // semitones from the original pitch. -20 means decrease 20 semitones from the
  // original pitch.
  double pitch = 2;

  // Optional. Volume gain (in dB) of the normal native volume supported by the
  // specific voice, in the range [-96.0, 16.0]. If unset, or set to a value of
  // 0.0 (dB), will play at normal native signal amplitude. A value of -6.0 (dB)
  // will play at approximately half the amplitude of the normal native signal
  // amplitude. A value of +6.0 (dB) will play at approximately twice the
  // amplitude of the normal native signal amplitude. We strongly recommend not
  // to exceed +10 (dB) as there's usually no effective increase in loudness for
  // any value greater than that.
  double volume_gain_db = 3;

  // Optional. An identifier which selects 'audio effects' profiles that are
  // applied on (post synthesized) text to speech. Effects are applied on top of
  // each other in the order they are given.
  repeated string effects_profile_id = 5;

  // Optional. The desired voice of the synthesized audio.
  VoiceSelectionParams voice = 4;
}

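// Illustrative example (not part of the generated API): a
// `SynthesizeSpeechConfig` in text format. All values are assumptions within
// the documented ranges; the voice name is left unset so the service chooses
// a voice from `ssml_gender` and the request language:
//
//   speaking_rate: 1.25
//   pitch: 2.0
//   volume_gain_db: -6.0
//   voice { ssml_gender: SSML_VOICE_GENDER_FEMALE }
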
// Audio encoding of the output audio format in Text-To-Speech.
enum OutputAudioEncoding {
  // Not specified.
  OUTPUT_AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  // Audio content returned as LINEAR16 also contains a WAV header.
  OUTPUT_AUDIO_ENCODING_LINEAR_16 = 1;

  // MP3 audio at 32kbps.
  OUTPUT_AUDIO_ENCODING_MP3 = 2;

  // MP3 audio at 64kbps.
  OUTPUT_AUDIO_ENCODING_MP3_64_KBPS = 4;

  // Opus encoded audio wrapped in an ogg container. The result will be a
  // file which can be played natively on Android, and in browsers (at least
  // Chrome and Firefox). The quality of the encoding is considerably higher
  // than MP3 while using approximately the same bitrate.
  OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  OUTPUT_AUDIO_ENCODING_MULAW = 5;
}

// Instructs the speech synthesizer on how to generate the output audio content.
// If this audio config is supplied in a request, it overrides all existing
// text-to-speech settings applied to the agent.
message OutputAudioConfig {
  // Required. Audio encoding of the synthesized audio content.
  OutputAudioEncoding audio_encoding = 1
      [(google.api.field_behavior) = REQUIRED];

  // The synthesis sample rate (in hertz) for this audio. If not
  // provided, then the synthesizer will use the default sample rate based on
  // the audio encoding. If this is different from the voice's natural sample
  // rate, then the synthesizer will honor this request by converting to the
  // desired sample rate (which might result in worse audio quality).
  int32 sample_rate_hertz = 2;

  // Configuration of how speech should be synthesized.
  SynthesizeSpeechConfig synthesize_speech_config = 3;
}

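// Illustrative example (not part of the generated API): an
// `OutputAudioConfig` in text format. `sample_rate_hertz` is omitted so the
// synthesizer falls back to the default rate for the chosen encoding, as
// documented above:
//
//   audio_encoding: OUTPUT_AUDIO_ENCODING_OGG_OPUS
//   synthesize_speech_config {
//     speaking_rate: 1.0
//     voice { ssml_gender: SSML_VOICE_GENDER_NEUTRAL }
//   }
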
// [DTMF](https://en.wikipedia.org/wiki/Dual-tone_multi-frequency_signaling)
// digit in Telephony Gateway.
enum TelephonyDtmf {
  // Not specified. This value may be used to indicate an absent digit.
  TELEPHONY_DTMF_UNSPECIFIED = 0;

  // Number: '1'.
  DTMF_ONE = 1;

  // Number: '2'.
  DTMF_TWO = 2;

  // Number: '3'.
  DTMF_THREE = 3;

  // Number: '4'.
  DTMF_FOUR = 4;

  // Number: '5'.
  DTMF_FIVE = 5;

  // Number: '6'.
  DTMF_SIX = 6;

  // Number: '7'.
  DTMF_SEVEN = 7;

  // Number: '8'.
  DTMF_EIGHT = 8;

  // Number: '9'.
  DTMF_NINE = 9;

  // Number: '0'.
  DTMF_ZERO = 10;

  // Letter: 'A'.
  DTMF_A = 11;

  // Letter: 'B'.
  DTMF_B = 12;

  // Letter: 'C'.
  DTMF_C = 13;

  // Letter: 'D'.
  DTMF_D = 14;

  // Asterisk/star: '*'.
  DTMF_STAR = 15;

  // Pound/diamond/hash/square/gate/octothorpe: '#'.
  DTMF_POUND = 16;
}

// A wrapper of repeated TelephonyDtmf digits.
message TelephonyDtmfEvents {
  // A sequence of TelephonyDtmf digits.
  repeated TelephonyDtmf dtmf_events = 1;
}

// Configures speech transcription for
// [ConversationProfile][google.cloud.dialogflow.v2.ConversationProfile].
message SpeechToTextConfig {
  // The speech model used in speech to text.
  // `SPEECH_MODEL_VARIANT_UNSPECIFIED` and `USE_BEST_AVAILABLE` will be treated
  // as `USE_ENHANCED`. It can be overridden in
  // [AnalyzeContentRequest][google.cloud.dialogflow.v2.AnalyzeContentRequest]
  // and
  // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2.StreamingAnalyzeContentRequest]
  // request. If an enhanced model variant is specified and an enhanced version
  // of the specified model for the language does not exist, then an error is
  // returned.
  SpeechModelVariant speech_model_variant = 1;

  // Which Speech model to select. Select the model best suited to your domain
  // to get best results. If a model is not explicitly specified, then a default
  // model is used.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#select-model)
  // for more details.
  string model = 2;
}
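// Illustrative example (not part of the generated API): a
// `SpeechToTextConfig` in text format for a telephony-oriented conversation
// profile. The model name is an assumption taken from the Cloud Speech model
// list linked above:
//
//   speech_model_variant: USE_ENHANCED
//   model: "phone_call"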