mitm/asr.go

149 lines
4.1 KiB
Go

package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"math/rand"
"net/http"
)
// Dict is the package-level dictionary that asrReplaceRandom draws replacement
// Chinese characters from (via GetRandomCharacter). It is not assigned anywhere
// in the code visible here; if it is still nil when asrReplaceRandom meets a
// rune in the range U+4E00..U+9FFF, that method terminates the process with
// log.Fatalln.
var Dict *ChineseDict
// WordInfo is one entry of the "word_info" JSON array: a piece of recognized
// text together with a confidence value and start/end timestamps.
type WordInfo struct {
// Confidence is decoded from "confidence"; its scale is not shown in this file.
Confidence float64 `json:"confidence"`
// EndTime and StartTime are decoded from "end_time" / "start_time".
// NOTE(review): units are not visible here — confirm against the producer.
EndTime int64 `json:"end_time"`
StartTime int64 `json:"start_time"`
// Text is the recognized text; asrReplaceRandom overwrites it in place.
Text string `json:"text"`
}
// Result is the "result" object of a speech recognition payload: language and
// volume metadata, several stream offsets, and the per-word entries.
type Result struct {
Language string `json:"language"`
LanguageDetails string `json:"language_details"`
Volume string `json:"volume"`
Source int `json:"source"`
AudioStreamOffset int64 `json:"audio_stream_offset"`
AudioStreamEndTime int64 `json:"audio_stream_end_time"`
// NOTE(review): "Boardcast"/"boardcast" looks like a misspelling of
// "broadcast". The JSON keys presumably mirror the upstream payload, so
// changing the tags would change the wire format — confirm before renaming.
BoardcastOffset int64 `json:"boardcast_offset"`
BoardcastEndTime int64 `json:"boardcast_end_time"`
// WordInfo holds the per-word entries. asrResultObfuscate rejects payloads in
// which this slice is empty, and asrReplaceRandom rewrites each entry's Text.
WordInfo []WordInfo `json:"word_info"`
}
// SpeechRecognitionResponse is the top-level JSON document this file works on:
// room, user and stream identifiers, the recognition Result, and a model name.
// asrResultObfuscate decodes it from the body it is handed, randomizes the
// word text, and re-encodes it.
type SpeechRecognitionResponse struct {
RoomID string `json:"room_id"`
UserID string `json:"user_id"`
StreamID string `json:"stream_id"`
// Result carries the recognized words and related metadata.
Result Result `json:"result"`
ModelName string `json:"model_name"`
}
// ToJSON serializes the response to a JSON byte slice.
//
// HTML escaping is switched off, so '<', '>' and '&' are written literally
// instead of as \u003c-style escapes, and the newline that json.Encoder
// appends after each value is stripped from the result.
//
// It returns a wrapped error if encoding fails.
func (s *SpeechRecognitionResponse) ToJSON() ([]byte, error) {
	out := new(bytes.Buffer)

	enc := json.NewEncoder(out)
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(s); encErr != nil {
		return nil, fmt.Errorf("failed to marshal to JSON: %w", encErr)
	}

	// Encode terminates its output with a single '\n'; drop exactly that byte.
	return bytes.TrimSuffix(out.Bytes(), []byte("\n")), nil
}
// FromJSON decodes jsonStr into the receiver using json.Unmarshal.
//
// It returns nil on success, or the decoder's error wrapped with a
// "failed to unmarshal JSON" prefix when jsonStr cannot be decoded into the
// struct.
func (s *SpeechRecognitionResponse) FromJSON(jsonStr string) error {
	if decodeErr := json.Unmarshal([]byte(jsonStr), s); decodeErr != nil {
		return fmt.Errorf("failed to unmarshal JSON: %w", decodeErr)
	}
	return nil
}
// asrReplaceRandom overwrites the Text of every entry in s.Result.WordInfo
// with randomized characters, keeping the rune count of each entry.
//
// Entries whose Text is "" or a single space are skipped. For the rest:
//   - If the text contains at least one rune in U+4E00..U+9FFF, only the runes
//     in that range are replaced, each with Dict.GetRandomCharacter(); every
//     other rune (including ASCII letters) is kept. If Dict is nil at that
//     point the process is terminated with log.Fatalln.
//   - Otherwise each ASCII letter is replaced with a random letter of the same
//     case, and all other runes are kept.
func (s *SpeechRecognitionResponse) asrReplaceRandom() {
	inHanRange := func(c rune) bool { return c >= 0x4e00 && c <= 0x9fff }

	entries := s.Result.WordInfo
	for idx := range entries {
		text := entries[idx].Text
		if text == "" || text == " " {
			continue
		}

		chars := []rune(text)

		// Decide once per entry which replacement mode applies.
		hasHan := false
		for _, c := range chars {
			if inHanRange(c) {
				hasHan = true
				break
			}
		}

		for k, c := range chars {
			switch {
			case hasHan:
				if !inHanRange(c) {
					// Non-CJK runes are left alone in a CJK entry.
					continue
				}
				if Dict == nil {
					log.Fatalln("CRITICAL ERROR: Dictionary not loaded")
				} else {
					chars[k] = Dict.GetRandomCharacter()
				}
			case c >= 'a' && c <= 'z':
				chars[k] = rune('a' + rand.Intn(26))
			case c >= 'A' && c <= 'Z':
				chars[k] = rune('A' + rand.Intn(26))
			}
		}

		entries[idx].Text = string(chars)
	}
}
// asrResultObfuscate decodes body as a SpeechRecognitionResponse, randomizes
// the text of its word entries, and returns the re-encoded JSON.
//
// Parameters:
//   - r: the request the body belongs to; only r.URL.Path is inspected.
//   - body: the raw JSON payload.
//   - replacePercentage: chance (0-100) that this call rewrites the payload,
//     as decided by shouldApplyASRReplacement.
//
// Returns:
//   - (nil, error) if the path is not the ASR upload endpoint, the body cannot
//     be decoded, it contains no word entries, or re-encoding fails;
//   - (nil, nil) if the percentage roll decided to leave this payload alone;
//   - (newBody, nil) otherwise.
func asrResultObfuscate(r *http.Request, body []byte, replacePercentage int) ([]byte, error) {
	if r.URL.Path != "/webcast/review/client_ai/upload_asr_result/" {
		return nil, fmt.Errorf("not an asr request")
	}
	if !shouldApplyASRReplacement(replacePercentage) {
		return nil, nil
	}

	var obj SpeechRecognitionResponse
	// FromJSON and ToJSON already prefix their errors with
	// "failed to unmarshal JSON: " / "failed to marshal to JSON: ". Wrapping
	// them again with the same text only duplicated the prefix, so the errors
	// are passed through as they are (the %w chain is preserved).
	if err := obj.FromJSON(string(body)); err != nil {
		return nil, err
	}
	if len(obj.Result.WordInfo) == 0 {
		return nil, fmt.Errorf("no word info")
	}

	obj.asrReplaceRandom()
	return obj.ToJSON()
}
// shouldApplyASRReplacement reports whether a payload should be rewritten,
// treating replacePercentage as a chance out of 100.
//
// Values <= 0 always yield false and values >= 100 always yield true, in both
// cases without consuming a random number. Anything in between is decided by
// a single rand.Intn(100) draw.
func shouldApplyASRReplacement(replacePercentage int) bool {
	switch {
	case replacePercentage <= 0:
		return false
	case replacePercentage >= 100:
		return true
	default:
		return rand.Intn(100) < replacePercentage
	}
}