@@ -27,6 +27,129 @@ option java_outer_classname = "AudioConfigProto";
2727option java_package = "com.google.cloud.dialogflow.v2" ;
2828option objc_class_prefix = "DF" ;
2929
// Audio encoding of the audio content sent in the conversational query
// request. Refer to the
// [Cloud Speech API
// documentation](https://cloud.google.com/speech-to-text/docs/basics) for more
// details.
enum AudioEncoding {
  // Not specified.
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  AUDIO_ENCODING_LINEAR_16 = 1;

  // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
  // Codec) is the recommended encoding because it is lossless (therefore
  // recognition is not compromised) and requires only about half the
  // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples, however, not all fields in `STREAMINFO` are supported.
  AUDIO_ENCODING_FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  AUDIO_ENCODING_MULAW = 3;

  // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  AUDIO_ENCODING_AMR = 4;

  // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_AMR_WB = 5;

  // Opus encoded audio frames in Ogg container
  // ([OggOpus](https://wiki.xiph.org/OggOpus)).
  // `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_OGG_OPUS = 6;

  // Although the use of lossy encodings is not recommended, if a very low
  // bitrate encoding is required, `OGG_OPUS` is highly preferred over
  // Speex encoding. The [Speex](https://speex.org/) encoding supported by
  // Dialogflow API has a header byte in each block, as in MIME type
  // `audio/x-speex-with-header-byte`.
  // It is a variant of the RTP Speex encoding defined in
  // [RFC 5574](https://tools.ietf.org/html/rfc5574).
  // The stream is a sequence of blocks, one block per RTP packet. Each block
  // starts with a byte containing the length of the block, in bytes, followed
  // by one or more frames of Speex data, padded to an integral number of
  // bytes (octets) as specified in RFC 5574. In other words, each RTP header
  // is replaced with a single byte containing the block length. Only Speex
  // wideband is supported. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}
78+
// Variant of the specified [Speech
// model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use.
//
// See the [Cloud Speech
// documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
// for which models have different variants. For example, the "phone_call"
// model has both a standard and an enhanced variant. When you use an enhanced
// model, you will generally receive higher quality results than for a
// standard model.
enum SpeechModelVariant {
  // No model variant specified. In this case Dialogflow defaults to
  // USE_BEST_AVAILABLE.
  SPEECH_MODEL_VARIANT_UNSPECIFIED = 0;

  // Use the best available variant of the [Speech
  // model][InputAudioConfig.model] that the caller is eligible for.
  //
  // Please see the [Dialogflow
  // docs](https://cloud.google.com/dialogflow-enterprise/docs/data-logging)
  // for how to make your project eligible for enhanced models.
  USE_BEST_AVAILABLE = 1;

  // Use standard model variant even if an enhanced model is available. See
  // the [Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
  // for details about enhanced models.
  USE_STANDARD = 2;

  // Use an enhanced model variant:
  //
  // * If an enhanced variant does not exist for the given
  //   [model][google.cloud.dialogflow.v2.InputAudioConfig.model] and request
  //   language, Dialogflow falls back to the standard variant.
  //
  //   The [Cloud Speech
  //   documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models)
  //   describes which models have enhanced variants.
  //
  // * If the API caller isn't eligible for enhanced models, Dialogflow
  //   returns an error. Please see the [Dialogflow
  //   docs](https://cloud.google.com/dialogflow-enterprise/docs/data-logging)
  //   for how to make your project eligible.
  USE_ENHANCED = 3;
}
121+
// Instructs the speech recognizer how to process the audio content.
message InputAudioConfig {
  // Required. Audio encoding of the audio content to process.
  AudioEncoding audio_encoding = 1;

  // Required. Sample rate (in Hertz) of the audio content sent in the query.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics) for
  // more details.
  int32 sample_rate_hertz = 2;

  // Required. The language of the supplied audio. Dialogflow does not do
  // translations. See [Language
  // Support](https://cloud.google.com/dialogflow-enterprise/docs/reference/language)
  // for a list of the currently supported language codes. Note that queries
  // in the same session do not necessarily need to specify the same language.
  string language_code = 3;

  // Optional. The collection of phrase hints which are used to boost accuracy
  // of speech recognition.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints)
  // for more details.
  repeated string phrase_hints = 4;

  // Optional. Which variant of the [Speech
  // model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use.
  SpeechModelVariant model_variant = 10;
}
152+
30153// Gender of the voice as described in
31154// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
32155enum SsmlVoiceGender {
@@ -109,7 +232,7 @@ enum OutputAudioEncoding {
109232 OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3 ;
110233}
111234
112- // Instructs the speech synthesizer how to generate the output audio content.
235+ // Instructs the speech synthesizer on how to generate the output audio content.
113236message OutputAudioConfig {
114237 // Required. Audio encoding of the synthesized audio content.
115238 OutputAudioEncoding audio_encoding = 1 ;
0 commit comments