From 0dd2c38226e64fe0ffc3bf1f0fb6efdc683eb83f Mon Sep 17 00:00:00 2001
From: wjsjwr
Date: Thu, 21 Aug 2025 23:46:17 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9EASR=EF=BC=88=E8=87=AA?=
 =?UTF-8?q?=E5=8A=A8=E8=AF=AD=E9=9F=B3=E8=AF=86=E5=88=AB=EF=BC=89=E7=9B=B8?=
 =?UTF-8?q?=E5=85=B3=E5=8A=9F=E8=83=BD=EF=BC=8C=E5=8C=85=E6=8B=AC=E5=AE=9E?=
 =?UTF-8?q?=E7=8E=B0asrReplaceRandom=E6=96=B9=E6=B3=95=E4=BB=A5=E9=9A=8F?=
 =?UTF-8?q?=E6=9C=BA=E6=9B=BF=E6=8D=A2WordInfo=E4=B8=AD=E7=9A=84=E6=96=87?=
 =?UTF-8?q?=E6=9C=AC=EF=BC=8C=E6=B7=BB=E5=8A=A0=E7=9B=B8=E5=BA=94=E7=9A=84?=
 =?UTF-8?q?=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95=E4=BB=A5=E9=AA=8C=E8=AF=81?=
 =?UTF-8?q?=E5=8A=9F=E8=83=BD=E7=9A=84=E6=AD=A3=E7=A1=AE=E6=80=A7=E5=92=8C?=
 =?UTF-8?q?=E6=80=A7=E8=83=BD=E3=80=82=E5=90=8C=E6=97=B6=EF=BC=8C=E6=9B=B4?=
 =?UTF-8?q?=E6=96=B0=E4=B8=BB=E7=A8=8B=E5=BA=8F=E4=BB=A5=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E8=AF=B7=E6=B1=82=E4=BD=93=E7=9A=84=E4=BF=AE=E6=94=B9=E5=92=8C?=
 =?UTF-8?q?=E6=B5=81=E9=87=8F=E8=BD=AC=E5=82=A8=E5=8A=9F=E8=83=BD=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is ignored by git am):
- asrResultObfuscate returns the errNotASRRequest sentinel and main.go checks
  it through isNotASRRequest instead of comparing err.Error() text.
- Errors from FromJSON/ToJSON are no longer wrapped a second time.
- asrReplaceRandom is a single pass without the shadowed loop index.
- Comments in asr_test.go that described a non-existent bug are corrected.
- Line breaks and indentation were reconstructed, and the blob hashes on the
  index lines predate these changes.

 asr.go      | 152 +++++++++++++++++++++++++++++++++
 asr_test.go | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.go     |  99 ++++++++++-----------
 3 files changed, 439 insertions(+), 54 deletions(-)
 create mode 100644 asr.go
 create mode 100644 asr_test.go

diff --git a/asr.go b/asr.go
new file mode 100644
index 0000000..062fd24
--- /dev/null
+++ b/asr.go
@@ -0,0 +1,152 @@
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"math/rand"
+	"net/http"
+	"slices"
+)
+
+// asrUploadPath is the only request path whose body asrResultObfuscate rewrites.
+const asrUploadPath = "/webcast/review/client_ai/upload_asr_result/"
+
+// cjkFirst and cjkLast bound the CJK Unified Ideographs block (U+4E00-U+9FFF),
+// the range treated as "Chinese" when detecting and generating characters.
+const (
+	cjkFirst = 0x4e00
+	cjkLast  = 0x9fff
+)
+
+// errNotASRRequest is returned by asrResultObfuscate for every other path.
+var errNotASRRequest = errors.New("not an asr request")
+
+// isNotASRRequest reports whether err only means that the request was not an
+// ASR upload, so callers do not have to compare error message text.
+func isNotASRRequest(err error) bool {
+	return errors.Is(err, errNotASRRequest)
+}
+
+// WordInfo represents individual word information with timing and confidence
+type WordInfo struct {
+	Confidence float64 `json:"confidence"`
+	EndTime    int64   `json:"end_time"`
+	StartTime  int64   `json:"start_time"`
+	Text       string  `json:"text"`
+}
+
+// Result represents the speech recognition result.
+// The "boardcast" spelling in the JSON keys is kept as-is; NOTE(review): it is
+// presumably dictated by the upstream wire format - confirm before renaming.
+type Result struct {
+	Language           string     `json:"language"`
+	LanguageDetails    string     `json:"language_details"`
+	Volume             string     `json:"volume"`
+	Source             int        `json:"source"`
+	AudioStreamOffset  int64      `json:"audio_stream_offset"`
+	AudioStreamEndTime int64      `json:"audio_stream_end_time"`
+	BoardcastOffset    int64      `json:"boardcast_offset"`
+	BoardcastEndTime   int64      `json:"boardcast_end_time"`
+	WordInfo           []WordInfo `json:"word_info"`
+}
+
+// SpeechRecognitionResponse represents the complete speech recognition response
+type SpeechRecognitionResponse struct {
+	RoomID    string `json:"room_id"`
+	UserID    string `json:"user_id"`
+	StreamID  string `json:"stream_id"`
+	Result    Result `json:"result"`
+	ModelName string `json:"model_name"`
+}
+
+// ToJSON encodes the struct as JSON without HTML escaping and without the
+// trailing newline that json.Encoder appends.
+func (s *SpeechRecognitionResponse) ToJSON() ([]byte, error) {
+	var buf bytes.Buffer
+	encoder := json.NewEncoder(&buf)
+	encoder.SetEscapeHTML(false)
+	if err := encoder.Encode(s); err != nil {
+		return nil, fmt.Errorf("failed to marshal to JSON: %w", err)
+	}
+	return bytes.TrimSuffix(buf.Bytes(), []byte("\n")), nil
+}
+
+// FromJSON parses a JSON string into the struct.
+func (s *SpeechRecognitionResponse) FromJSON(jsonStr string) error {
+	if err := json.Unmarshal([]byte(jsonStr), s); err != nil {
+		return fmt.Errorf("failed to unmarshal JSON: %w", err)
+	}
+	return nil
+}
+
+// asrReplaceEmpty drops every recognized word. The slice is left empty but
+// non-nil so that it is encoded as [] rather than null.
+func (s *SpeechRecognitionResponse) asrReplaceEmpty() {
+	s.Result.WordInfo = []WordInfo{}
+}
+
+// isChinese reports whether r lies in the CJK Unified Ideographs block.
+func isChinese(r rune) bool {
+	return r >= cjkFirst && r <= cjkLast
+}
+
+// asrReplaceRandom overwrites, in place, the text of every word with random
+// characters of the same kind. Empty and single-space texts are skipped. A
+// text containing any Chinese character has only its Chinese characters
+// replaced; any other text has only its ASCII letters replaced, keeping their
+// case. Digits, punctuation and all remaining runes are left untouched.
+func (s *SpeechRecognitionResponse) asrReplaceRandom() {
+	words := s.Result.WordInfo
+	for i := range words {
+		text := words[i].Text
+		if text == "" || text == " " {
+			continue
+		}
+		runes := []rune(text)
+		hasChinese := slices.ContainsFunc(runes, isChinese)
+		for j, r := range runes {
+			switch {
+			case hasChinese:
+				if isChinese(r) {
+					runes[j] = rune(cjkFirst + rand.Intn(cjkLast-cjkFirst+1))
+				}
+			case r >= 'a' && r <= 'z':
+				runes[j] = rune('a' + rand.Intn(26))
+			case r >= 'A' && r <= 'Z':
+				runes[j] = rune('A' + rand.Intn(26))
+			}
+		}
+		words[i].Text = string(runes)
+	}
+}
+
+// asrResultObfuscate rewrites the JSON body of an ASR result upload so that
+// the recognized words are either dropped or randomized, each with equal
+// probability. It returns errNotASRRequest when r targets another path, and
+// a descriptive error when the body cannot be decoded or carries no words;
+// on any error the returned body is nil. The body is re-encoded from
+// SpeechRecognitionResponse, so JSON fields the struct does not declare are
+// not carried over.
+func asrResultObfuscate(r *http.Request, body []byte) ([]byte, error) {
+	fmt.Printf("asrResultObfuscate: %s\n", r.URL.Path)
+	if r.URL.Path != asrUploadPath {
+		return nil, errNotASRRequest
+	}
+	var obj SpeechRecognitionResponse
+	// FromJSON and ToJSON already wrap their errors with context, so they are
+	// returned as-is instead of being wrapped a second time.
+	if err := obj.FromJSON(string(body)); err != nil {
+		return nil, err
+	}
+	if len(obj.Result.WordInfo) == 0 {
+		return nil, errors.New("no word info")
+	}
+	if rand.Intn(100) < 50 {
+		obj.asrReplaceEmpty()
+	} else {
+		obj.asrReplaceRandom()
+	}
+	return obj.ToJSON()
+}
diff --git a/asr_test.go b/asr_test.go
new file mode 100644
index 0000000..a156156
--- /dev/null
+++ b/asr_test.go
@@ -0,0 +1,242 @@
+package main
+
+import (
+	"testing"
+)
+
+// NOTE: asrReplaceRandom mutates s.Result.WordInfo in place by writing through
+// the slice index, so these tests expect replaceable text to change. Because
+// the replacement is random, a replaced text can in principle equal the
+// original; for the inputs used here the probability is negligible.
+
+func TestSpeechRecognitionResponse_asrReplaceRandom(t *testing.T) {
+	t.Run("EmptyAndSpaceText", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			Result: Result{
+				WordInfo: []WordInfo{
+					{Text: "", Confidence: 0.9, StartTime: 100, EndTime: 200},
+					{Text: " ", Confidence: 0.8, StartTime: 200, EndTime: 300},
+					{Text: "hello", Confidence: 0.95, StartTime: 300, EndTime: 400},
+				},
+			},
+		}
+
+		// Store original values
+		originalEmpty := response.Result.WordInfo[0].Text
+		originalSpace := response.Result.WordInfo[1].Text
+		originalHello := response.Result.WordInfo[2].Text
+
+		response.asrReplaceRandom()
+
+		// Empty and single-space texts are skipped and must stay unchanged,
+		// while ordinary text must be replaced.
+		if response.Result.WordInfo[0].Text != originalEmpty {
+			t.Errorf("Expected empty text to remain unchanged, got %q", response.Result.WordInfo[0].Text)
+		}
+		if response.Result.WordInfo[1].Text != originalSpace {
+			t.Errorf("Expected space text to remain unchanged, got %q", response.Result.WordInfo[1].Text)
+		}
+
+		if response.Result.WordInfo[2].Text == originalHello {
+			t.Errorf("Expected text to be replaced, got %q", response.Result.WordInfo[2].Text)
+		}
+	})
+
+	t.Run("ChineseCharacterReplacement", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			Result: Result{
+				WordInfo: []WordInfo{
+					{Text: "你好", Confidence: 0.9, StartTime: 100, EndTime: 200},
+					{Text: "世界", Confidence: 0.8, StartTime: 200, EndTime: 300},
+				},
+			},
+		}
+
+		originalTexts := make([]string, len(response.Result.WordInfo))
+		for i, word := range response.Result.WordInfo {
+			originalTexts[i] = word.Text
+		}
+
+		response.asrReplaceRandom()
+
+		for i, word := range response.Result.WordInfo {
+			if word.Text == originalTexts[i] {
+				t.Errorf("Expected Chinese text to be replaced")
+			}
+		}
+	})
+
+	t.Run("EnglishCharacterReplacement", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			Result: Result{
+				WordInfo: []WordInfo{
+					{Text: "Hello", Confidence: 0.9, StartTime: 100, EndTime: 200},
+					{Text: "WORLD", Confidence: 0.8, StartTime: 200, EndTime: 300},
+					{Text: "test", Confidence: 0.7, StartTime: 300, EndTime: 400},
+				},
+			},
+		}
+
+		originalTexts := make([]string, len(response.Result.WordInfo))
+		for i, word := range response.Result.WordInfo {
+			originalTexts[i] = word.Text
+		}
+
+		response.asrReplaceRandom()
+
+		// ASCII letters are re-drawn at random, so every text should change
+		for i, word := range response.Result.WordInfo {
+			if word.Text == originalTexts[i] {
+				t.Errorf("Expected English text to be replaced")
+			}
+		}
+	})
+
+	t.Run("MixedContent", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			Result: Result{
+				WordInfo: []WordInfo{
+					{Text: "Hello123你好", Confidence: 0.9, StartTime: 100, EndTime: 200},
+					{Text: "Test!@#测试", Confidence: 0.8, StartTime: 200, EndTime: 300},
+				},
+			},
+		}
+
+		originalTexts := make([]string, len(response.Result.WordInfo))
+		for i, word := range response.Result.WordInfo {
+			originalTexts[i] = word.Text
+		}
+
+		response.asrReplaceRandom()
+
+		// Only the Chinese characters are re-drawn here, which still changes each text
+		for i, word := range response.Result.WordInfo {
+			if word.Text == originalTexts[i] {
+				t.Errorf("Expected mixed content to be replaced")
+			}
+		}
+	})
+
+	t.Run("EmptyWordInfoSlice", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			Result: Result{
+				WordInfo: []WordInfo{},
+			},
+		}
+
+		// Should not panic
+		response.asrReplaceRandom()
+
+		// WordInfo should remain empty
+		if len(response.Result.WordInfo) != 0 {
+			t.Errorf("Expected WordInfo to remain empty, got length %d", len(response.Result.WordInfo))
+		}
+	})
+
+	t.Run("OnlySpecialCharacters", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			Result: Result{
+				WordInfo: []WordInfo{
+					{Text: "123!@#$%", Confidence: 0.9, StartTime: 100, EndTime: 200},
+					{Text: ".,;:()[]", Confidence: 0.8, StartTime: 200, EndTime: 300},
+				},
+			},
+		}
+
+		originalTexts := make([]string, len(response.Result.WordInfo))
+		for i, word := range response.Result.WordInfo {
+			originalTexts[i] = word.Text
+		}
+
+		response.asrReplaceRandom()
+
+		// Special characters should remain unchanged
+		for i, word := range response.Result.WordInfo {
+			if word.Text != originalTexts[i] {
+				t.Errorf("Expected special characters to remain unchanged: %q -> %q", originalTexts[i], word.Text)
+			}
+		}
+	})
+
+	t.Run("PreservesNonTextFields", func(t *testing.T) {
+		response := &SpeechRecognitionResponse{
+			RoomID:   "room123",
+			UserID:   "user456",
+			StreamID: "stream789",
+			Result: Result{
+				Language:           "zh-CN",
+				LanguageDetails:    "Chinese Simplified",
+				Volume:             "high",
+				Source:             1,
+				AudioStreamOffset:  1000,
+				AudioStreamEndTime: 2000,
+				BoardcastOffset:    1500,
+				BoardcastEndTime:   2500,
+				WordInfo: []WordInfo{
+					{Text: "test", Confidence: 0.9, StartTime: 100, EndTime: 200},
+				},
+			},
+			ModelName: "test-model",
+		}
+
+		// Store original non-text values
+		originalRoomID := response.RoomID
+		originalUserID := response.UserID
+		originalStreamID := response.StreamID
+		originalLanguage := response.Result.Language
+		originalConfidence := response.Result.WordInfo[0].Confidence
+		originalStartTime := response.Result.WordInfo[0].StartTime
+		originalEndTime := response.Result.WordInfo[0].EndTime
+		originalModelName := response.ModelName
+
+		response.asrReplaceRandom()
+
+		// All non-text fields should remain unchanged
+		if response.RoomID != originalRoomID {
+			t.Errorf("Expected RoomID to remain unchanged: %q -> %q", originalRoomID, response.RoomID)
+		}
+		if response.UserID != originalUserID {
+			t.Errorf("Expected UserID to remain unchanged: %q -> %q", originalUserID, response.UserID)
+		}
+		if response.StreamID != originalStreamID {
+			t.Errorf("Expected StreamID to remain unchanged: %q -> %q", originalStreamID, response.StreamID)
+		}
+		if response.Result.Language != originalLanguage {
+			t.Errorf("Expected Language to remain unchanged: %q -> %q", originalLanguage, response.Result.Language)
+		}
+		if response.Result.WordInfo[0].Confidence != originalConfidence {
+			t.Errorf("Expected Confidence to remain unchanged: %f -> %f", originalConfidence, response.Result.WordInfo[0].Confidence)
+		}
+		if response.Result.WordInfo[0].StartTime != originalStartTime {
+			t.Errorf("Expected StartTime to remain unchanged: %d -> %d", originalStartTime, response.Result.WordInfo[0].StartTime)
+		}
+		if response.Result.WordInfo[0].EndTime != originalEndTime {
+			t.Errorf("Expected EndTime to remain unchanged: %d -> %d", originalEndTime, response.Result.WordInfo[0].EndTime)
+		}
+		if response.ModelName != originalModelName {
+			t.Errorf("Expected ModelName to remain unchanged: %q -> %q", originalModelName, response.ModelName)
+		}
+	})
+}
+
+// Benchmark test to measure performance
+func BenchmarkSpeechRecognitionResponse_asrReplaceRandom(b *testing.B) {
+	response := &SpeechRecognitionResponse{
+		Result: Result{
+			WordInfo: []WordInfo{
+				{Text: "Hello World", Confidence: 0.9, StartTime: 100, EndTime: 200},
+				{Text: "你好世界", Confidence: 0.8, StartTime: 200, EndTime: 300},
+				{Text: "Test123!@#", Confidence: 0.7, StartTime: 300, EndTime: 400},
+			},
+		},
+	}
+
+	for b.Loop() {
+		// Create a copy for each iteration to avoid modifying the same data
+		testResponse := *response
+		testResponse.Result.WordInfo = make([]WordInfo, len(response.Result.WordInfo))
+		copy(testResponse.Result.WordInfo, response.Result.WordInfo)
+
+		testResponse.asrReplaceRandom()
+	}
+}
diff --git a/main.go b/main.go
index f71c879..5592d59 100644
--- a/main.go
+++ b/main.go
@@ -37,6 +37,11 @@ type Config struct {
 	} `toml:"dump"`
 }
 
+type UserData struct {
+	RequestBody  []byte
+	ModifiedBody []byte
+}
+
 type ProxyServer struct {
 	config    *Config
 	tlsConfig *tls.Config
@@ -204,9 +209,6 @@ func (p *ProxyServer) setupHandlers() {
 
 	// Log all HTTP requests and capture request body
 	p.proxy.OnRequest().DoFunc(func(r *http.Request, ctx *goproxy.ProxyCtx) (*http.Request, *http.Response) {
-		timestamp := time.Now().Format("20060102T15:04:05.000000")
-		fmt.Printf("[%s][INFO][Interest=%v] HTTP Request: %s %s from %s\n", timestamp, p.isDomainOfInterest(r.Host), r.Method, r.URL.String(), r.RemoteAddr)
-
 		// Read request body once and recreate it for both dumping and forwarding
 		if r.Body != nil {
 			reqBody, err := io.ReadAll(r.Body)
@@ -216,13 +218,30 @@ func (p *ProxyServer) setupHandlers() {
 			}
 			r.Body.Close()
 
-			// Recreate the request body so it can be forwarded to the server
-			r.Body = io.NopCloser(bytes.NewReader(reqBody))
-			r.ContentLength = int64(len(reqBody))
+			var newReqBody []byte
+
+			if p.isDomainOfInterest(r.Host) {
+				newReqBody, err = asrResultObfuscate(r, reqBody)
+				if err != nil && !isNotASRRequest(err) {
+					log.Printf("Failed to obfuscate request body: %v", err)
+				}
+			}
+
+			if newReqBody != nil {
+				r.Body = io.NopCloser(bytes.NewReader(newReqBody))
+				r.ContentLength = int64(len(newReqBody))
+			} else {
+				// Recreate the request body so it can be forwarded to the server
+				r.Body = io.NopCloser(bytes.NewReader(reqBody))
+				r.ContentLength = int64(len(reqBody))
+			}
 
 			// Store request body in context for later use in response handler
 			if len(reqBody) > 0 {
-				ctx.UserData = reqBody
+				ctx.UserData = UserData{
+					RequestBody:  reqBody,
+					ModifiedBody: newReqBody,
+				}
 			}
 		}
 
@@ -233,13 +252,21 @@ func (p *ProxyServer) setupHandlers() {
 	p.proxy.OnResponse().DoFunc(func(r *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
 		timestamp := time.Now().Format("20060102T15:04:05.000000")
 		if r != nil {
-			fmt.Printf("[%s][INFO][Interest=%v] HTTP Response: %s %s\n", timestamp, p.isDomainOfInterest(ctx.Req.Host), r.Status, ctx.Req.URL.String())
+			fmt.Printf(
+				"[%s][INFO][Interest=%v] HTTP Response: %s %s\n",
+				timestamp,
+				p.isDomainOfInterest(ctx.Req.Host),
+				r.Status,
+				ctx.Req.URL.String(),
+			)
 
 			// Get request body from context (if available)
 			var reqBody []byte
+			var modifiedBody []byte
 			if ctx.UserData != nil {
-				if body, ok := ctx.UserData.([]byte); ok {
-					reqBody = body
+				if userData, ok := ctx.UserData.(UserData); ok {
+					reqBody = userData.RequestBody
+					modifiedBody = userData.ModifiedBody
 				}
 			}
 
@@ -257,10 +284,10 @@ func (p *ProxyServer) setupHandlers() {
 				r.ContentLength = int64(len(respBody))
 
 				// Dump traffic to file with both request and response bodies
-				p.dumpHTTPTrafficWithBodies(ctx.Req, r, reqBody, respBody)
+				p.dumpHTTPTrafficWithBodies(ctx.Req, r, reqBody, modifiedBody, respBody)
 			} else {
 				// No response body, but may have request body
-				p.dumpHTTPTrafficWithBodies(ctx.Req, r, reqBody, nil)
+				p.dumpHTTPTrafficWithBodies(ctx.Req, r, reqBody, modifiedBody, nil)
 			}
 		}
 		return r
@@ -296,49 +323,8 @@ func (p *ProxyServer) Shutdown() {
 	p.server.Shutdown(ctx)
 }
 
-// dumpHTTPTraffic dumps HTTP request and response to file
-func (p *ProxyServer) dumpHTTPTraffic(req *http.Request, resp *http.Response) {
-	file, err := os.Create(p.getFilePath(req))
-	if err != nil {
-		log.Printf("Failed to create dump file: %v", err)
-		return
-	}
-	defer file.Close()
-
-	// Write request information
-	fmt.Fprintf(file, "=== REQUEST ===\n")
-	fmt.Fprintf(file, "%s %s %s\n", req.Method, req.URL.String(), req.Proto)
-	fmt.Fprintf(file, "Host: %s\n", req.Host)
-
-	// Write all request headers
-	for name, values := range req.Header {
-		for _, value := range values {
-			fmt.Fprintf(file, "%s: %s\n", name, value)
-		}
-	}
-	fmt.Fprintf(file, "\n")
-
-	// Note: Request body handling is done in the request handler to avoid consuming it twice
-
-	// Write response information
-	if resp != nil {
-		fmt.Fprintf(file, "\n=== RESPONSE ===\n")
-		fmt.Fprintf(file, "%s %s\n", resp.Proto, resp.Status)
-
-		// Write all response headers
-		for name, values := range resp.Header {
-			for _, value := range values {
-				fmt.Fprintf(file, "%s: %s\n", name, value)
-			}
-		}
-		fmt.Fprintf(file, "\n")
-
-		// Note: Response body will be handled in the response handler to avoid consuming it
-	}
-}
-
 // dumpHTTPTrafficWithBodies dumps HTTP request and response with both bodies to file
-func (p *ProxyServer) dumpHTTPTrafficWithBodies(req *http.Request, resp *http.Response, reqBody []byte, respBody []byte) {
+func (p *ProxyServer) dumpHTTPTrafficWithBodies(req *http.Request, resp *http.Response, reqBody []byte, modifiedBody []byte, respBody []byte) {
 	file, err := os.Create(p.getFilePath(req))
 	if err != nil {
 		log.Printf("Failed to create dump file: %v", err)
@@ -364,6 +350,11 @@ func (p *ProxyServer) dumpHTTPTrafficWithBodies(req *http.Request, resp *http.Re
 		fmt.Fprintf(file, "%s\n", string(reqBody))
 	}
 
+	if modifiedBody != nil {
+		fmt.Fprintf(file, "\n=== MODIFIED REQUEST BODY ===\n")
+		fmt.Fprintf(file, "%s\n", string(modifiedBody))
+	}
+
 	// Write response information
 	if resp != nil {
 		fmt.Fprintf(file, "\n=== RESPONSE ===\n")