新增中文词典功能,创建ChineseDict结构体以加载和生成随机中文字符,更新ASR功能以使用词典进行更真实的字符替换。同时,更新.gitignore以排除.zip文件。

This commit is contained in:
wjsjwr 2025-08-22 23:40:12 +08:00
parent 0dd2c38226
commit a26a612a5b
6 changed files with 199 additions and 32 deletions

1
.gitignore vendored
View File

@ -23,3 +23,4 @@ go.work
*_dumps/
out/
log
*.zip

71
DICT_README.md Normal file
View File

@ -0,0 +1,71 @@
# Chinese Dictionary Implementation
This implementation provides a `ChineseDict` struct that loads Chinese characters from `dict.txt` and provides functionality to generate random Chinese characters.
## Features
- **Load Chinese characters**: Reads `dict.txt` and extracts all Chinese characters (Unicode range 0x4E00-0x9FFF)
- **Random character generation**: Get single random Chinese characters
- **Random string generation**: Generate strings of random Chinese characters with specified length
- **Character counting**: Get the total number of Chinese characters loaded (a character that appears several times in `dict.txt` is counted each time)
## Usage
### Basic Usage
```go
// Create a new dictionary instance
dict, err := NewChineseDict("dict.txt")
if err != nil {
log.Fatalf("Error loading dictionary: %v", err)
}
// Get a single random Chinese character
randomChar := dict.GetRandomCharacter()
fmt.Printf("Random character: %c\n", randomChar)
// Get a random string of 5 Chinese characters
randomString := dict.GetRandomString(5)
fmt.Printf("Random string: %s\n", randomString)
// Get the total number of characters in dictionary
count := dict.GetCharacterCount()
fmt.Printf("Total characters: %d\n", count)
```
### Demo
Run the demo to see the functionality in action:
```bash
go run . -dict
```
This will display:
- Total number of Chinese characters loaded
- 10 random single characters
- Random strings of different lengths (3, 5, 8, 10 characters)
## Integration with ASR
The dictionary is automatically integrated with the ASR (Automatic Speech Recognition) functionality. When processing speech recognition results, the system will:
1. Try to load the dictionary from `dict.txt`
2. Use dictionary characters for more realistic Chinese character replacement
3. Abort with a fatal error if the dictionary cannot be loaded (there is no fallback to purely random generation)
## File Structure
- `dict.go` - Main dictionary implementation
- `dict.txt` - Source file containing Chinese characters
- `asr.go` - ASR functionality with dictionary integration
- `main.go` - Main application with demo functionality
## Requirements
- Go 1.16 or later (uses `os.ReadFile`)
- `dict.txt` file in the working directory the program is started from (it is opened via the relative path `dict.txt`)
## Character Statistics
The current `dict.txt` contains **479,939** Chinese characters, providing a rich source for realistic random character generation.

39
asr.go
View File

@ -4,10 +4,14 @@ import (
"bytes"
"encoding/json"
"fmt"
"log"
"math/rand"
"net/http"
)
// Try to load the dictionary for better Chinese character replacement
var Dict *ChineseDict
// WordInfo represents individual word information with timing and confidence
type WordInfo struct {
Confidence float64 `json:"confidence"`
@ -64,10 +68,6 @@ func (s *SpeechRecognitionResponse) FromJSON(jsonStr string) error {
return nil
}
func (s *SpeechRecognitionResponse) asrReplaceEmpty() {
s.Result.WordInfo = []WordInfo{}
}
func (s *SpeechRecognitionResponse) asrReplaceRandom() {
for i, word := range s.Result.WordInfo {
if word.Text == "" || word.Text == " " {
@ -82,24 +82,29 @@ func (s *SpeechRecognitionResponse) asrReplaceRandom() {
}
}
if containsChinese {
// Replace with random Chinese characters
// Replace with random Chinese characters from dictionary if available
runes := []rune(word.Text)
for i := range runes {
if runes[i] >= 0x4e00 && runes[i] <= 0x9fff {
// Generate random Chinese character in common range
runes[i] = rune(0x4e00 + rand.Intn(0x9fff-0x4e00+1))
for j := range runes {
if runes[j] >= 0x4e00 && runes[j] <= 0x9fff {
if Dict != nil {
// Use dictionary for more realistic Chinese characters
runes[j] = Dict.GetRandomCharacter()
} else {
// Fallback to random generation
log.Fatalln("CRITICAL ERROR: Dictionary not loaded")
}
}
}
s.Result.WordInfo[i].Text = string(runes)
} else {
// Replace with random English characters
runes := []rune(word.Text)
for i := range runes {
if (runes[i] >= 'a' && runes[i] <= 'z') || (runes[i] >= 'A' && runes[i] <= 'Z') {
if runes[i] >= 'a' && runes[i] <= 'z' {
runes[i] = rune('a' + rand.Intn(26))
for j := range runes {
if (runes[j] >= 'a' && runes[j] <= 'z') || (runes[j] >= 'A' && runes[j] <= 'Z') {
if runes[j] >= 'a' && runes[j] <= 'z' {
runes[j] = rune('a' + rand.Intn(26))
} else {
runes[i] = rune('A' + rand.Intn(26))
runes[j] = rune('A' + rand.Intn(26))
}
}
}
@ -109,7 +114,6 @@ func (s *SpeechRecognitionResponse) asrReplaceRandom() {
}
func asrResultObfuscate(r *http.Request, body []byte) ([]byte, error) {
fmt.Printf("asrResultObfuscate: %s\n", r.URL.Path)
if r.URL.Path != "/webcast/review/client_ai/upload_asr_result/" {
return nil, fmt.Errorf("not an asr request")
}
@ -121,11 +125,8 @@ func asrResultObfuscate(r *http.Request, body []byte) ([]byte, error) {
if len(obj.Result.WordInfo) == 0 {
return nil, fmt.Errorf("no word info")
}
if rand.Intn(100) < 50 {
obj.asrReplaceEmpty()
} else {
obj.asrReplaceRandom()
}
jsonData, err := obj.ToJSON()
if err != nil {
return nil, fmt.Errorf("failed to marshal to JSON: %w", err)

60
dict.go Normal file
View File

@ -0,0 +1,60 @@
package main
import (
"math/rand"
"os"
)
// ChineseDict holds the pool of Chinese characters that random characters
// and random strings are drawn from.
type ChineseDict struct {
	// characters keeps every accepted rune in file order, duplicates
	// included, so a character that occurs more often in the source file is
	// drawn proportionally more often.
	characters []rune
}

// NewChineseDict reads the UTF-8 text file at filePath and builds a
// dictionary from the CJK Unified Ideographs (U+4E00..U+9FFF) it contains.
//
// Every other rune is dropped while loading: line breaks, whitespace,
// punctuation, Latin text, a byte-order mark, and the replacement rune
// produced by invalid UTF-8. This guarantees that GetRandomCharacter and
// GetRandomString only ever hand out Chinese characters.
//
// The error from os.ReadFile is returned unchanged when the file cannot be
// read. A readable file without any Chinese characters yields an empty,
// non-nil dictionary.
func NewChineseDict(filePath string) (*ChineseDict, error) {
	const (
		firstIdeograph = 0x4E00
		lastIdeograph  = 0x9FFF
	)

	content, err := os.ReadFile(filePath)
	if err != nil {
		return nil, err
	}

	// Every rune in the accepted range is encoded as three bytes in UTF-8,
	// so len(content)/3 is an upper bound on the number of runes kept.
	characters := make([]rune, 0, len(content)/3)
	for _, r := range string(content) {
		if r >= firstIdeograph && r <= lastIdeograph {
			characters = append(characters, r)
		}
	}

	return &ChineseDict{characters: characters}, nil
}

// GetRandomCharacter returns one character picked at a random index of the
// dictionary. It returns the zero rune when the dictionary is empty.
func (cd *ChineseDict) GetRandomCharacter() rune {
	if len(cd.characters) == 0 {
		return 0
	}
	return cd.characters[rand.Intn(len(cd.characters))]
}

// GetRandomString returns a string made of length independently drawn
// dictionary characters. It returns the empty string when the dictionary is
// empty or length is not positive.
func (cd *ChineseDict) GetRandomString(length int) string {
	if len(cd.characters) == 0 || length <= 0 {
		return ""
	}

	result := make([]rune, length)
	for i := range result {
		result[i] = cd.GetRandomCharacter()
	}
	return string(result)
}

// GetCharacterCount returns how many characters the dictionary holds.
// Repeated characters are counted once per occurrence.
func (cd *ChineseDict) GetCharacterCount() int {
	return len(cd.characters)
}

1
dict.txt Normal file

File diff suppressed because one or more lines are too long

39
main.go
View File

@ -48,11 +48,15 @@ type ProxyServer struct {
proxy *goproxy.ProxyHttpServer
server *http.Server
originalProxy string
verbose bool
quiet bool
}
func main() {
// Parse command line flags
var testConnectivity = flag.Bool("test", false, "Test proxy connectivity")
var verbose = flag.Bool("v", false, "Enable verbose mode - dump all traffic instead of only modified requests/responses")
var debugMode = flag.Bool("d", false, "Debug mode - dump modified requests/responses")
flag.Parse()
// Set console to UTF-8 on Windows to prevent garbled text
@ -60,6 +64,14 @@ func main() {
setConsoleUTF8()
}
fmt.Println("Reading dictionary...")
dict, err := NewChineseDict("dict.txt")
if err != nil {
log.Fatalf("Failed to load dictionary: %v", err)
}
Dict = dict
fmt.Printf("Dictionary loaded successfully, size=%d\n", Dict.GetCharacterCount())
fmt.Println("Starting MITM proxy server...")
// Load configuration
@ -80,7 +92,7 @@ func main() {
}
// Create proxy server
proxy, err := NewProxyServer(config)
proxy, err := NewProxyServer(config, *verbose, *debugMode)
if err != nil {
log.Fatalf("Failed to create proxy server: %v", err)
}
@ -151,7 +163,7 @@ func loadConfig(filename string) (*Config, error) {
return parseConfig(filename)
}
func NewProxyServer(config *Config) (*ProxyServer, error) {
func NewProxyServer(config *Config, verbose bool, debugMode bool) (*ProxyServer, error) {
// Load hardcoded P12 certificate for MITM
tlsConfig, err := loadHardcodedCertificate()
if err != nil {
@ -160,12 +172,14 @@ func NewProxyServer(config *Config) (*ProxyServer, error) {
// Create goproxy instance
goProxy := goproxy.NewProxyHttpServer()
goProxy.Verbose = true
goProxy.Verbose = verbose
ps := &ProxyServer{
config: config,
tlsConfig: tlsConfig,
proxy: goProxy,
verbose: verbose,
quiet: !debugMode,
}
// Configure MITM for HTTPS traffic
@ -209,6 +223,10 @@ func (p *ProxyServer) setupHandlers() {
// Log all HTTP requests and capture request body
p.proxy.OnRequest().DoFunc(func(r *http.Request, ctx *goproxy.ProxyCtx) (*http.Request, *http.Response) {
if !p.isDomainOfInterest(r.Host) && p.quiet {
return r, nil
}
// Read request body once and recreate it for both dumping and forwarding
if r.Body != nil {
reqBody, err := io.ReadAll(r.Body)
@ -225,6 +243,9 @@ func (p *ProxyServer) setupHandlers() {
if err != nil && err.Error() != "not an asr request" {
log.Printf("Failed to obfuscate request body: %v", err)
}
if p.quiet && err == nil {
log.Println("[INFO] ASR Request Body Modified")
}
}
if newReqBody != nil {
@ -250,8 +271,13 @@ func (p *ProxyServer) setupHandlers() {
// Log all HTTP responses and dump traffic
p.proxy.OnResponse().DoFunc(func(r *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
if p.quiet {
return r
}
timestamp := time.Now().Format("20060102T15:04:05.000000")
if r != nil {
if p.verbose || p.isDomainOfInterest(ctx.Req.Host) {
fmt.Printf(
"[%s][INFO][Interest=%v] HTTP Response: %s %s\n",
timestamp,
@ -259,6 +285,7 @@ func (p *ProxyServer) setupHandlers() {
r.Status,
ctx.Req.URL.String(),
)
}
// Get request body from context (if available)
var reqBody []byte
@ -284,12 +311,18 @@ func (p *ProxyServer) setupHandlers() {
r.ContentLength = int64(len(respBody))
// Dump traffic to file with both request and response bodies
// Only dump if verbose mode is enabled OR if the request was modified
if p.verbose || modifiedBody != nil {
p.dumpHTTPTrafficWithBodies(ctx.Req, r, reqBody, modifiedBody, respBody)
}
} else {
// No response body, but may have request body
// Only dump if verbose mode is enabled OR if the request was modified
if p.verbose || modifiedBody != nil {
p.dumpHTTPTrafficWithBodies(ctx.Req, r, reqBody, modifiedBody, nil)
}
}
}
return r
})
}