Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -350,13 +350,20 @@ message TranscriptStreamResponse {
TranscriptResult final_result = 2;
}

// TranscriptWord is a single word of a transcript together with its timing.
// The faster-whisper backend fills start/end with the word's offset in seconds
// multiplied by 1e9 (i.e. nanoseconds); the Go side converts them directly to
// time.Duration.
message TranscriptWord {
// Offset at which the word begins.
int64 start = 1;
// Offset at which the word ends.
int64 end = 2;
// The word text as reported by the backend.
string text = 3;
}

// TranscriptSegment is one segment of a transcript. start/end use the same
// unit as TranscriptWord: the faster-whisper backend sends seconds * 1e9 and
// the Go side converts the values directly to time.Duration.
message TranscriptSegment {
// Segment identifier; the faster-whisper backend numbers segments from 0.
int32 id = 1;
// Offset at which the segment begins.
int64 start = 2;
// Offset at which the segment ends.
int64 end = 3;
// Transcribed text of the segment.
string text = 4;
repeated int32 tokens = 5;
string speaker = 6;
// Per-word timings for this segment. The faster-whisper backend populates
// this only when "word" is present in the request's timestamp_granularities.
repeated TranscriptWord words = 7;
}

message GenerateImageRequest {
Expand Down
21 changes: 19 additions & 2 deletions backend/python/faster-whisper/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,28 @@ def AudioTranscription(self, request, context):
resultSegments = []
text = ""
try:
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
word_timestamps = "word" in request.timestamp_granularities
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False, word_timestamps=word_timestamps)
id = 0
for segment in segments:
print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
resultSegments.append(backend_pb2.TranscriptSegment(id=id, start=int(segment.start)*1e9, end=int(segment.end)*1e9, text=segment.text))

words = []
if word_timestamps and hasattr(segment, 'words'):
for word in segment.words:
words.append(backend_pb2.TranscriptWord(
start=int(word.start * 1e9),
end=int(word.end * 1e9),
text=word.word
))

resultSegments.append(backend_pb2.TranscriptSegment(
id=id,
start=int(segment.start * 1e9),
end=int(segment.end * 1e9),
text=segment.text,
words=words
))
text += segment.text
id += 1
except Exception as err:
Expand Down
12 changes: 12 additions & 0 deletions core/backend/transcript.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,22 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
Language: r.Language,
Duration: float64(r.Duration),
}

for _, s := range r.Segments {
var tks []int
for _, t := range s.Tokens {
tks = append(tks, int(t))
}
var words []schema.TranscriptionWord
for _, w := range s.Words {
var word = schema.TranscriptionWord {
Start: time.Duration(w.Start),
End: time.Duration(w.End),
Text: w.Text,
}
words = append(words, word)
tr.Words = append(tr.Words, word)
}
tr.Segments = append(tr.Segments,
schema.TranscriptionSegment{
Text: s.Text,
Expand All @@ -192,6 +203,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
End: time.Duration(s.End),
Tokens: tks,
Speaker: s.Speaker,
Words: words,
})
}
return tr
Expand Down
38 changes: 36 additions & 2 deletions core/cli/transcript.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,48 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
fmt.Println(schema.TranscriptionResponse(tr, t.ResponseFormat))
case schema.TranscriptionResponseFormatJson:
tr.Segments = nil
tr.Words = nil
fallthrough
case schema.TranscriptionResponseFormatJsonVerbose:
trs := schema.TranscriptionResultSeconds{
Text: tr.Text,
Language: tr.Language,
Duration: tr.Duration,
Words: []schema.TranscriptionWordSeconds{},
Segments: []schema.TranscriptionSegmentSeconds{},
}
for _, word := range(tr.Words) {
trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
for _, seg := range(tr.Segments) {
segWords := []schema.TranscriptionWordSeconds{}
for _, word := range(seg.Words) {
segWords = append(segWords, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
Id: seg.Id,
Start: seg.Start.Seconds(),
End: seg.End.Seconds(),
Text: seg.Text,
Tokens: seg.Tokens,
Speaker: seg.Speaker,
Words: segWords,
})
}
var mtr []byte
var err error
if t.PrettyPrint {
mtr, err = json.MarshalIndent(tr, "", " ")
mtr, err = json.MarshalIndent(trs, "", " ")
} else {
mtr, err = json.Marshal(tr)
mtr, err = json.Marshal(trs)
}
if err != nil {
return err
Expand Down
36 changes: 35 additions & 1 deletion core/http/endpoints/openai/transcription.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,43 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
return c.String(http.StatusOK, schema.TranscriptionResponse(tr, responseFormat))
case schema.TranscriptionResponseFormatJson:
tr.Segments = nil
tr.Words = nil
fallthrough
case schema.TranscriptionResponseFormatJsonVerbose, "": // maintain backwards compatibility
return c.JSON(http.StatusOK, tr)
trs := schema.TranscriptionResultSeconds{
Text: tr.Text,
Language: tr.Language,
Duration: tr.Duration,
Words: []schema.TranscriptionWordSeconds{},
Segments: []schema.TranscriptionSegmentSeconds{},
}
for _, word := range(tr.Words) {
trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
for _, seg := range(tr.Segments) {
segWords := []schema.TranscriptionWordSeconds{}
for _, word := range(seg.Words) {
segWords = append(segWords, schema.TranscriptionWordSeconds{
Start: word.Start.Seconds(),
End: word.End.Seconds(),
Text: word.Text,
})
}
trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{
Id: seg.Id,
Start: seg.Start.Seconds(),
End: seg.End.Seconds(),
Text: seg.Text,
Tokens: seg.Tokens,
Speaker: seg.Speaker,
Words: segWords,
})
}
return c.JSON(http.StatusOK, trs)
default:
return errors.New("invalid response_format")
}
Expand Down
44 changes: 38 additions & 6 deletions core/schema/transcription.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,49 @@ package schema
import "time"

// TranscriptionSegment is one segment of a transcription as used internally.
//
// Start and End are time.Duration values; encoding/json therefore emits them
// as integer nanosecond counts. Speaker and Words are omitted from the JSON
// output when empty. Words carries the per-word timings that belong to this
// segment.
type TranscriptionSegment struct {
	Id      int                 `json:"id"`
	Start   time.Duration       `json:"start"`
	End     time.Duration       `json:"end"`
	Text    string              `json:"text"`
	Tokens  []int               `json:"tokens"`
	Speaker string              `json:"speaker,omitempty"`
	Words   []TranscriptionWord `json:"words,omitempty"`
}

// TranscriptionWord is a single transcribed word with its start and end
// offsets expressed as time.Duration values.
type TranscriptionWord struct {
	Start time.Duration `json:"start"`
	End   time.Duration `json:"end"`
	Text  string        `json:"text"`
}

// TranscriptionResult is the internal representation of a complete
// transcription. Timings inside Segments and Words are time.Duration values.
type TranscriptionResult struct {
// Segments and Words are dropped from the JSON output when empty; callers
// set both to nil to produce the plain "json" response format.
Segments []TranscriptionSegment `json:"segments,omitempty"`
// Words is the flat list of all words across segments (the proto conversion
// appends every segment word here as well as to the owning segment).
Words []TranscriptionWord `json:"words,omitempty"`
Text string `json:"text"`
Language string `json:"language,omitempty"`
// NOTE(review): unit of Duration is not visible here — presumably seconds; confirm against the backend.
Duration float64 `json:"duration,omitempty"`
}

// TranscriptionSegmentSeconds mirrors TranscriptionSegment for JSON responses,
// with Start and End as float64 seconds (callers fill them from
// time.Duration.Seconds()) instead of time.Duration.
type TranscriptionSegmentSeconds struct {
Id int `json:"id"`
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
Tokens []int `json:"tokens"`
Speaker string `json:"speaker,omitempty"`
// Omitted from the JSON output when the segment has no word timings.
Words []TranscriptionWordSeconds `json:"words,omitempty"`
}

// TranscriptionWordSeconds mirrors TranscriptionWord for JSON responses, with
// Start and End as float64 seconds (callers fill them from
// time.Duration.Seconds()) instead of time.Duration.
type TranscriptionWordSeconds struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
}

// TranscriptionResultSeconds is the JSON-facing form of TranscriptionResult:
// same fields and JSON keys, but segment and word timings are float64 seconds.
// The CLI and the HTTP transcription endpoint build it from a
// TranscriptionResult for the json and verbose json response formats.
type TranscriptionResultSeconds struct {
// Segments and Words are omitted from the JSON output when empty.
Segments []TranscriptionSegmentSeconds `json:"segments,omitempty"`
Words []TranscriptionWordSeconds `json:"words,omitempty"`
Text string `json:"text"`
Language string `json:"language,omitempty"`
// Copied unchanged from TranscriptionResult.Duration by the callers.
Duration float64 `json:"duration,omitempty"`
}
Loading