// Source: mitm/asr.go (135 lines, 3.7 KiB, Go)

package main
import (
"bytes"
"encoding/json"
"fmt"
"math/rand"
"net/http"
)
// WordInfo describes a single recognized word of a speech recognition result:
// its text, its time span and the recognizer's confidence.
//
// The field order is significant: encoding/json writes object keys in struct
// declaration order, so reordering fields changes the serialized payload.
type WordInfo struct {
// Confidence is carried under the "confidence" key.
// NOTE(review): the scale (presumably 0..1) is not established here — confirm.
Confidence float64 `json:"confidence"`
// EndTime is carried under the "end_time" key.
// NOTE(review): unit and reference point are not visible in this file — confirm.
EndTime int64 `json:"end_time"`
// StartTime is carried under the "start_time" key; same caveat as EndTime.
StartTime int64 `json:"start_time"`
// Text is the recognized text. asrReplaceRandom overwrites it in place.
Text string `json:"text"`
}
// Result is the "result" object of a SpeechRecognitionResponse. It groups
// stream-level metadata with the list of recognized words.
//
// The field order is significant: encoding/json writes object keys in struct
// declaration order.
type Result struct {
// Language and LanguageDetails are passed through unchanged by this file.
// NOTE(review): their value format (e.g. language codes) is not visible here.
Language string `json:"language"`
LanguageDetails string `json:"language_details"`
// Volume is transported as a JSON string, not a number.
Volume string `json:"volume"`
// Source is an integer code. NOTE(review): its meaning is not defined in this file.
Source int `json:"source"`
// Offsets / end times of the audio stream.
// NOTE(review): units are not established in this file — confirm.
AudioStreamOffset int64 `json:"audio_stream_offset"`
AudioStreamEndTime int64 `json:"audio_stream_end_time"`
// The "boardcast" spelling matches the JSON keys and must stay as-is;
// presumably it mirrors the remote API's own spelling of "broadcast".
BoardcastOffset int64 `json:"boardcast_offset"`
BoardcastEndTime int64 `json:"boardcast_end_time"`
// WordInfo lists the recognized words. A nil slice encodes as null, an
// empty non-nil slice as []; asrReplaceEmpty relies on the latter.
WordInfo []WordInfo `json:"word_info"`
}
// SpeechRecognitionResponse is the top-level JSON document handled by this
// file. asrResultObfuscate decodes the intercepted request body into this type,
// mutates Result.WordInfo, and re-encodes it.
//
// The field order is significant: encoding/json writes object keys in struct
// declaration order.
type SpeechRecognitionResponse struct {
// Identifiers carried through unchanged by the code in this file.
RoomID string `json:"room_id"`
UserID string `json:"user_id"`
StreamID string `json:"stream_id"`
// Result holds the recognition output, including the per-word list.
Result Result `json:"result"`
// ModelName is carried under the "model_name" key.
// NOTE(review): presumably names the recognizer model — confirm.
ModelName string `json:"model_name"`
}
// ToJSON serializes the response to compact JSON.
//
// The value is written through a json.Encoder with HTML escaping disabled, so
// '<', '>' and '&' appear literally rather than as \u003c-style escapes.
// Encoder.Encode terminates its output with a newline; that single trailing
// newline is stripped before the bytes are returned.
//
// On an encoding failure it returns a nil slice and the wrapped encoder error.
func (s *SpeechRecognitionResponse) ToJSON() ([]byte, error) {
	out := new(bytes.Buffer)

	enc := json.NewEncoder(out)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(s); err != nil {
		return nil, fmt.Errorf("failed to marshal to JSON: %w", err)
	}

	// Drop the line terminator appended by Encode.
	return bytes.TrimSuffix(out.Bytes(), []byte("\n")), nil
}
// FromJSON decodes jsonStr into the receiver.
//
// Decoding uses json.Unmarshal, so keys that are absent from the input leave
// the corresponding fields untouched and unknown keys are ignored. A decoding
// failure is returned wrapped, with the underlying error available through
// errors.Is / errors.As.
func (s *SpeechRecognitionResponse) FromJSON(jsonStr string) error {
	raw := []byte(jsonStr)
	if err := json.Unmarshal(raw, s); err != nil {
		return fmt.Errorf("failed to unmarshal JSON: %w", err)
	}
	return nil
}
// asrReplaceEmpty discards every recognized word.
//
// The list is replaced with an empty, non-nil slice so that it serializes as
// "word_info":[] rather than "word_info":null.
func (s *SpeechRecognitionResponse) asrReplaceEmpty() {
	s.Result.WordInfo = make([]WordInfo, 0)
}
// asrReplaceRandom overwrites the text of every word in place with random
// characters of the same class, keeping length, timing and confidence.
//
// Words whose text is "" or " " are skipped. For every other word:
//   - if the text contains at least one rune in U+4E00..U+9FFF, each rune in
//     that range is replaced by a random rune from the same range and all
//     other runes (including Latin letters) are left as they are;
//   - otherwise each ASCII letter is replaced by a random ASCII letter of the
//     same case and all other runes are left as they are.
//
// The text always goes through a []rune round trip, so it is rewritten even
// when no rune ends up being replaced.
func (s *SpeechRecognitionResponse) asrReplaceRandom() {
	const (
		hanFirst = 0x4e00
		hanLast  = 0x9fff
	)

	words := s.Result.WordInfo
	for idx := range words {
		text := words[idx].Text
		if text == "" || text == " " {
			continue
		}

		chars := []rune(text)

		// Decide once per word which replacement rule applies.
		hasHan := false
		for _, c := range chars {
			if c >= hanFirst && c <= hanLast {
				hasHan = true
				break
			}
		}

		for pos, c := range chars {
			switch {
			case hasHan:
				// Han-containing words: only the Han runes are randomized.
				if c >= hanFirst && c <= hanLast {
					chars[pos] = rune(hanFirst + rand.Intn(hanLast-hanFirst+1))
				}
			case c >= 'a' && c <= 'z':
				chars[pos] = rune('a' + rand.Intn(26))
			case c >= 'A' && c <= 'Z':
				chars[pos] = rune('A' + rand.Intn(26))
			}
		}

		words[idx].Text = string(chars)
	}
}
// asrResultObfuscate rewrites the JSON body of an ASR result upload.
//
// It prints the request path to stdout, then proceeds only when the path is
// exactly "/webcast/review/client_ai/upload_asr_result/". The body is decoded
// into a SpeechRecognitionResponse and its word list is altered in one of two
// ways, chosen with equal probability: emptied (asrReplaceEmpty) or replaced
// with random characters (asrReplaceRandom). The result is re-encoded with
// ToJSON.
//
// Parameters:
//   - r: the intercepted request; r.URL must be non-nil.
//   - body: the raw request body, expected to be a JSON object.
//
// It returns the rewritten body, or a nil slice and an error when the path
// does not match, the body is not valid JSON, the word list is empty, or
// re-encoding fails. Decode and encode errors are wrapped with %w so the
// underlying error stays reachable through errors.Is / errors.As.
func asrResultObfuscate(r *http.Request, body []byte) ([]byte, error) {
	fmt.Printf("asrResultObfuscate: %s\n", r.URL.Path)
	if r.URL.Path != "/webcast/review/client_ai/upload_asr_result/" {
		return nil, fmt.Errorf("not an asr request")
	}

	var obj SpeechRecognitionResponse
	if err := obj.FromJSON(string(body)); err != nil {
		// FromJSON already says "failed to unmarshal JSON"; add only this
		// layer's context instead of repeating the same prefix.
		return nil, fmt.Errorf("obfuscating asr result: %w", err)
	}
	if len(obj.Result.WordInfo) == 0 {
		return nil, fmt.Errorf("no word info")
	}

	// Even split between dropping the words and scrambling them.
	if rand.Intn(100) < 50 {
		obj.asrReplaceEmpty()
	} else {
		obj.asrReplaceRandom()
	}

	jsonData, err := obj.ToJSON()
	if err != nil {
		// ToJSON already says "failed to marshal to JSON"; see above.
		return nil, fmt.Errorf("obfuscating asr result: %w", err)
	}
	return jsonData, nil
}