diff --git a/backend/backend.proto b/backend/backend.proto index 43b6abe6c69f..e16c84fdbe7d 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -350,6 +350,12 @@ message TranscriptStreamResponse { TranscriptResult final_result = 2; } +message TranscriptWord { + int64 start = 1; + int64 end = 2; + string text = 3; +} + message TranscriptSegment { int32 id = 1; int64 start = 2; @@ -357,6 +363,7 @@ message TranscriptSegment { string text = 4; repeated int32 tokens = 5; string speaker = 6; + repeated TranscriptWord words = 7; } message GenerateImageRequest { diff --git a/backend/python/faster-whisper/backend.py b/backend/python/faster-whisper/backend.py index 9cfb9dfd1c80..551f2fac148b 100755 --- a/backend/python/faster-whisper/backend.py +++ b/backend/python/faster-whisper/backend.py @@ -55,11 +55,28 @@ def AudioTranscription(self, request, context): resultSegments = [] text = "" try: - segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False) + word_timestamps = "word" in request.timestamp_granularities + segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False, word_timestamps=word_timestamps) id = 0 for segment in segments: print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) - resultSegments.append(backend_pb2.TranscriptSegment(id=id, start=int(segment.start)*1e9, end=int(segment.end)*1e9, text=segment.text)) + + words = [] + if word_timestamps and hasattr(segment, 'words'): + for word in segment.words: + words.append(backend_pb2.TranscriptWord( + start=int(word.start * 1e9), + end=int(word.end * 1e9), + text=word.word + )) + + resultSegments.append(backend_pb2.TranscriptSegment( + id=id, + start=int(segment.start * 1e9), + end=int(segment.end * 1e9), + text=segment.text, + words=words + )) text += segment.text id += 1 except Exception as err: diff --git a/core/backend/transcript.go b/core/backend/transcript.go index c3bfb77b4dfa..6d0b4c63d849 100644 --- a/core/backend/transcript.go +++ b/core/backend/transcript.go @@ -179,11 +179,22 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR Language: r.Language, Duration: float64(r.Duration), } + for _, s := range r.Segments { var tks []int for _, t := range s.Tokens { tks = append(tks, int(t)) } + var words []schema.TranscriptionWord + for _, w := range s.Words { + var word = schema.TranscriptionWord { + Start: time.Duration(w.Start), + End: time.Duration(w.End), + Text: w.Text, + } + words = append(words, word) + tr.Words = append(tr.Words, word) + } tr.Segments = append(tr.Segments, schema.TranscriptionSegment{ Text: s.Text, @@ -192,6 +203,7 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR End: time.Duration(s.End), Tokens: tks, Speaker: s.Speaker, + Words: words, }) } return tr diff --git a/core/cli/transcript.go b/core/cli/transcript.go index e4b7ff5856ed..d62beadf0776 100644 --- a/core/cli/transcript.go +++ b/core/cli/transcript.go @@ -81,14 +81,48 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error { fmt.Println(schema.TranscriptionResponse(tr, t.ResponseFormat)) case schema.TranscriptionResponseFormatJson: tr.Segments = nil + tr.Words = nil fallthrough case schema.TranscriptionResponseFormatJsonVerbose: + trs := schema.TranscriptionResultSeconds{ + Text: tr.Text, + Language: tr.Language, + Duration: tr.Duration, + Words: []schema.TranscriptionWordSeconds{}, + Segments: []schema.TranscriptionSegmentSeconds{}, + } + for _, word := range(tr.Words) { + trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{ + Start: word.Start.Seconds(), + End: word.End.Seconds(), + Text: word.Text, + }) + } + for _, seg := range(tr.Segments) { + segWords := []schema.TranscriptionWordSeconds{} + for _, word := range(seg.Words) { + segWords = append(segWords, schema.TranscriptionWordSeconds{ + Start: word.Start.Seconds(), + End: word.End.Seconds(), + Text: word.Text, + }) + } + trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{ + Id: seg.Id, + Start: seg.Start.Seconds(), + End: seg.End.Seconds(), + Text: seg.Text, + Tokens: seg.Tokens, + Speaker: seg.Speaker, + Words: segWords, + }) + } var mtr []byte var err error if t.PrettyPrint { - mtr, err = json.MarshalIndent(tr, "", " ") + mtr, err = json.MarshalIndent(trs, "", " ") } else { - mtr, err = json.Marshal(tr) + mtr, err = json.Marshal(trs) } if err != nil { return err diff --git a/core/http/endpoints/openai/transcription.go b/core/http/endpoints/openai/transcription.go index cf18e8244175..2979ecd59809 100644 --- a/core/http/endpoints/openai/transcription.go +++ b/core/http/endpoints/openai/transcription.go @@ -138,9 +138,43 @@ func TranscriptEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app return c.String(http.StatusOK, schema.TranscriptionResponse(tr, responseFormat)) case schema.TranscriptionResponseFormatJson: tr.Segments = nil + tr.Words = nil fallthrough case schema.TranscriptionResponseFormatJsonVerbose, "": // maintain backwards compatibility - return c.JSON(http.StatusOK, tr) + trs := schema.TranscriptionResultSeconds{ + Text: tr.Text, + Language: tr.Language, + Duration: tr.Duration, + Words: []schema.TranscriptionWordSeconds{}, + Segments: []schema.TranscriptionSegmentSeconds{}, + } + for _, word := range(tr.Words) { + trs.Words = append(trs.Words, schema.TranscriptionWordSeconds{ + Start: word.Start.Seconds(), + End: word.End.Seconds(), + Text: word.Text, + }) + } + for _, seg := range(tr.Segments) { + segWords := []schema.TranscriptionWordSeconds{} + for _, word := range(seg.Words) { + segWords = append(segWords, schema.TranscriptionWordSeconds{ + Start: word.Start.Seconds(), + End: word.End.Seconds(), + Text: word.Text, + }) + } + trs.Segments = append(trs.Segments, schema.TranscriptionSegmentSeconds{ + Id: seg.Id, + Start: seg.Start.Seconds(), + End: seg.End.Seconds(), + Text: seg.Text, + Tokens: seg.Tokens, + Speaker: seg.Speaker, + Words: segWords, + }) + } + return c.JSON(http.StatusOK, trs) default: return errors.New("invalid response_format") } diff --git a/core/schema/transcription.go b/core/schema/transcription.go index b0d3a8eb3abe..747adac94ae6 100644 --- a/core/schema/transcription.go +++ b/core/schema/transcription.go @@ -3,17 +3,49 @@ package schema import "time" type TranscriptionSegment struct { - Id int `json:"id"` - Start time.Duration `json:"start"` - End time.Duration `json:"end"` - Text string `json:"text"` - Tokens []int `json:"tokens"` - Speaker string `json:"speaker,omitempty"` + Id int `json:"id"` + Start time.Duration `json:"start"` + End time.Duration `json:"end"` + Text string `json:"text"` + Tokens []int `json:"tokens"` + Speaker string `json:"speaker,omitempty"` + Words []TranscriptionWord `json:"words,omitempty"` +} + +type TranscriptionWord struct { + Start time.Duration `json:"start"` + End time.Duration `json:"end"` + Text string `json:"text"` } type TranscriptionResult struct { Segments []TranscriptionSegment `json:"segments,omitempty"` + Words []TranscriptionWord `json:"words,omitempty"` Text string `json:"text"` Language string `json:"language,omitempty"` Duration float64 `json:"duration,omitempty"` } + +type TranscriptionSegmentSeconds struct { + Id int `json:"id"` + Start float64 `json:"start"` + End float64 `json:"end"` + Text string `json:"text"` + Tokens []int `json:"tokens"` + Speaker string `json:"speaker,omitempty"` + Words []TranscriptionWordSeconds `json:"words,omitempty"` +} + +type TranscriptionWordSeconds struct { + Start float64 `json:"start"` + End float64 `json:"end"` + Text string `json:"text"` +} + +type TranscriptionResultSeconds struct { + Segments []TranscriptionSegmentSeconds `json:"segments,omitempty"` + Words []TranscriptionWordSeconds `json:"words,omitempty"` + Text string `json:"text"` + Language string `json:"language,omitempty"` + Duration float64 `json:"duration,omitempty"` +}