// Source: azure-rest-api-specs/specification/cognitiveservices/OpenAI.Inference/models/audio/commons.tsp
// (Azure/azure-rest-api-specs at commit d33d47dc85ef1f2c29c3dd278ded3650df9e49d7)

import "@azure-tools/typespec-azure-core";
import "@typespec/rest";
import "@typespec/http";

using TypeSpec.Rest;
using TypeSpec.Http;

namespace Azure.OpenAI;

// Output formats that can be requested for an audio transcription.
// Each member name is the identifier used within the spec; the string literal
// assigned to it is the value of the enum member.
@doc("Defines the format of the output.")
enum AudioTranscriptionFormat {
    @doc("JSON output in which only the 'text' field is set.")
    simpleJson: "json",

    @doc("Transcribed text returned as plain text.")
    text: "text",

    @doc("Transcription formatted as SubRip (SRT) subtitles.")
    srt: "srt",

    @doc("JSON output that carries more data than 'json', such as the task, language, duration, and segments.")
    verboseJson: "verbose_json",

    @doc("Transcription formatted as Web Video Text Tracks (WebVTT) subtitles.")
    vtt: "vtt",
}

// Request options for audio transcription: two HTTP headers (content type and
// content length) plus the audio payload and optional tuning fields.
@doc("""
Transcription request.
Requesting format 'json' will result in only the 'text' field being set.
For more output data use 'verbose_json'.
""")
model AudioTranscriptionOptionsCommon {
    // The header value is a string literal type, so the multipart boundary is
    // fixed to "azure_openai" by the spec itself.
    @doc("""
    The content type for the operation. Always multipart/form-data for this operation.
    You need to set your content separator with the boundary value hardcoded here: "azure_openai"
    """)
    @header("content-type")
    contentType: "multipart/form-data; boundary=azure_openai";

    @doc("The content length of the operation. This needs to be provided by the caller.")
    @header("content-length")
    contentLength: int64;

    @doc("The audio file object to transcribe.")
    file: bytes;

    @doc("An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.")
    prompt?: string;

    // Optional; defaults to 0 when omitted.
    @doc("""
    The sampling temperature, between 0 and 1.
    Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
    If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
    """)
    temperature?: float32 = 0;
}

// Minimal transcription result; AudioTranscriptionVerboseJson extends this model.
@doc("Transcription response containing only the transcribed text, as produced when the 'json' format is requested.")
model AudioTranscriptionSimpleJson {
    @doc("Transcribed text.")
    text: string;
}

// Kind of audio task; referenced by the `task` property of
// AudioTranscriptionVerboseJson.
@doc("Audio transcription task type.")
enum AudioTranscriptionTask {
    @doc("Identifies a transcription task.")
    transcribe: "transcribe",

    @doc("Identifies a translation task.")
    translate: "translate",
}

// One segment of a transcription, as listed in
// AudioTranscriptionVerboseJson.segments.
// Properties carrying @projectedName("json", ...) use the given snake_case
// name in the JSON projection while keeping a camelCase name in the spec.
@doc("Transcription segment.")
model AudioTranscriptionSegment {
    @doc("Segment identifier.")
    id: int32;

    @doc("Segment start offset.")
    start: float32;

    @doc("Segment end offset.")
    end: float32;

    @doc("Segment text.")
    text: string;

    @doc("Temperature.")
    temperature: float32;

    @doc("Average log probability.")
    @projectedName("json", "avg_logprob")
    averageLogProb: float32;

    @doc("Compression ratio.")
    @projectedName("json", "compression_ratio")
    compressionRatio: float32;

    @doc("Probability of 'no speech'.")
    @projectedName("json", "no_speech_prob")
    noSpeechProb: float32;

    @doc("Tokens in this segment.")
    tokens: int32[];

    // NOTE(review): wording follows the public OpenAI API reference for the
    // segment object; confirm it matches the Azure service behavior.
    @doc("Seek offset of the segment.")
    seek: int32;
}

// Extended transcription result. Inherits the `text` property from
// AudioTranscriptionSimpleJson and adds the task type, language, duration,
// and the list of segments.
// NOTE(review): presumably the payload returned for the 'verbose_json' format;
// confirm against the operation definitions, which are not in this file.
@doc("Transcription response.")
model AudioTranscriptionVerboseJson extends AudioTranscriptionSimpleJson {
    @doc("Audio transcription task.")
    task: AudioTranscriptionTask;

    @doc("Language detected in the source audio file.")
    language: string;

    // Encoded on the wire as a float32 number of seconds rather than the
    // default duration encoding.
    @doc("Duration.")
    @encode("seconds", float32)
    duration: duration;

    // Array of AudioTranscriptionSegment entries.
    @doc("Segments.")
    segments: AudioTranscriptionSegment[];
}