mitm/dict.go

152 lines
3.9 KiB
Go

package main
import (
"bufio"
"math/rand"
"os"
"strings"
)
// ChineseDict holds the text lines of a dictionary together with a cursor
// (selected line plus rune offset) that the character getters advance.
type ChineseDict struct {
	// lines is the dictionary content, one entry per loaded line.
	lines []string

	// currentLineIndex is the index into lines of the selected line;
	// NewChineseDict sets it to -1 to mean "no line selected yet".
	// currentCharIndex is the offset into currentLineRunes of the next rune
	// to look at.
	currentLineIndex, currentCharIndex int

	// currentLineRunes is the selected line decoded into runes, filled in by
	// selectNewRandomLine.
	currentLineRunes []rune
}
// NewChineseDict reads the file at filePath line by line and returns a
// ChineseDict holding every line that is non-empty after surrounding
// whitespace has been trimmed.
//
// An error from opening the file or from scanning it is returned unchanged,
// and the returned dictionary is then nil. On success no line is selected yet
// (currentLineIndex is -1).
func NewChineseDict(filePath string) (*ChineseDict, error) {
	// Upper bound the scanner buffer may grow to, so long lines still scan.
	const maxLineBytes = 5 * 1024 * 1024

	f, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	// Start at the default token size and let the scanner grow on demand.
	sc.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	var kept []string
	for sc.Scan() {
		if text := strings.TrimSpace(sc.Text()); text != "" {
			kept = append(kept, text)
		}
	}
	if scanErr := sc.Err(); scanErr != nil {
		return nil, scanErr
	}

	// -1 marks "no line chosen yet"; the character cursor keeps its zero value.
	return &ChineseDict{lines: kept, currentLineIndex: -1}, nil
}
// isValidCharacter reports whether r may be handed out by the dictionary:
// either a rune in the CJK Unified Ideographs range U+4E00..U+9FFF or an
// ASCII letter (a-z, A-Z). Every other rune yields false. The receiver's
// state is not consulted.
func (cd *ChineseDict) isValidCharacter(r rune) bool {
	switch {
	case 0x4E00 <= r && r <= 0x9FFF:
		// CJK Unified Ideographs.
		return true
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z':
		// ASCII letters.
		return true
	default:
		return false
	}
}
// selectNewRandomLine picks one of the dictionary lines at random, caches it
// as runes and rewinds the character cursor to the start of that line.
// It does nothing when the dictionary has no lines.
func (cd *ChineseDict) selectNewRandomLine() {
	total := len(cd.lines)
	if total == 0 {
		return
	}

	picked := rand.Intn(total)
	cd.currentLineIndex, cd.currentCharIndex = picked, 0
	cd.currentLineRunes = []rune(cd.lines[picked])
}
// GetRandomCharacter returns one valid character (as decided by
// isValidCharacter: a CJK ideograph or an ASCII letter) from the dictionary.
//
// With a single line, a valid character is drawn at random from that line on
// every call. With several lines, a random line is selected and its valid
// characters are handed out in order on successive calls; when the line is
// used up another random line is selected.
//
// It returns 0 only when the dictionary has no lines or contains no valid
// character at all. The cursor fields are updated so that currentCharIndex
// points just past the returned rune.
func (cd *ChineseDict) GetRandomCharacter() rune {
	lineCount := len(cd.lines)
	if lineCount == 0 {
		return 0
	}

	if lineCount == 1 {
		// Also (re)load the line when nothing is cached, so a dictionary that
		// was not built by NewChineseDict cannot reach rand.Intn(0).
		if cd.currentLineIndex == -1 || len(cd.currentLineRunes) == 0 {
			cd.selectNewRandomLine()
		}
		n := len(cd.currentLineRunes)
		if n == 0 {
			return 0
		}

		// Draw random positions and reject invalid runes; this stays O(1) in
		// the usual case where most of the line is valid.
		const maxRandomTries = 16
		for try := 0; try < maxRandomTries; try++ {
			i := rand.Intn(n)
			if r := cd.currentLineRunes[i]; cd.isValidCharacter(r) {
				cd.currentCharIndex = i + 1
				return r
			}
		}

		// Mostly-invalid line: walk it once from a random start so a valid
		// rune is found whenever one exists.
		start := rand.Intn(n)
		for off := 0; off < n; off++ {
			i := (start + off) % n
			if r := cd.currentLineRunes[i]; cd.isValidCharacter(r) {
				cd.currentCharIndex = i + 1
				return r
			}
		}
		return 0
	}

	// Bounded number of random line draws; each failed draw means the selected
	// line had no valid character left.
	maxAttempts := 2 * lineCount
	for attempt := 0; attempt < maxAttempts; attempt++ {
		// First call, or the current line is used up: move to a new line.
		if cd.currentLineIndex == -1 || cd.currentCharIndex >= len(cd.currentLineRunes) {
			cd.selectNewRandomLine()
		}

		// Consume runes until a valid one shows up or the line ends.
		for cd.currentCharIndex < len(cd.currentLineRunes) {
			r := cd.currentLineRunes[cd.currentCharIndex]
			cd.currentCharIndex++
			if cd.isValidCharacter(r) {
				return r
			}
		}
	}

	// The random draws kept landing on lines without usable characters. Sweep
	// every line once, from a random starting line, so that 0 is reported only
	// when the dictionary truly has no valid character.
	start := rand.Intn(lineCount)
	for off := 0; off < lineCount; off++ {
		idx := (start + off) % lineCount
		runes := []rune(cd.lines[idx])
		for i, r := range runes {
			if cd.isValidCharacter(r) {
				cd.currentLineIndex = idx
				cd.currentLineRunes = runes
				cd.currentCharIndex = i + 1
				return r
			}
		}
	}
	return 0
}
// GetRandomString builds a string of exactly length runes by calling
// GetRandomCharacter once per position. It returns "" when length is not
// positive or the dictionary has no lines. A 0 rune returned by
// GetRandomCharacter is stored as-is.
func (cd *ChineseDict) GetRandomString(length int) string {
	if length <= 0 || len(cd.lines) == 0 {
		return ""
	}

	runes := make([]rune, 0, length)
	for len(runes) < length {
		runes = append(runes, cd.GetRandomCharacter())
	}
	return string(runes)
}
// GetLineCount reports how many lines the dictionary currently holds.
func (cd *ChineseDict) GetLineCount() int {
	total := len(cd.lines)
	return total
}
// GetCharacterCount walks every rune of every line and reports how many of
// them isValidCharacter accepts. The cursor state is not touched.
func (cd *ChineseDict) GetCharacterCount() int {
	var total int
	for i := range cd.lines {
		for _, ch := range cd.lines[i] {
			if !cd.isValidCharacter(ch) {
				continue
			}
			total++
		}
	}
	return total
}