// mitm/dict.go

package main

import (
	"bufio"
	"math/rand"
	"os"
	"strings"
)

// ChineseDict holds the text lines of a dictionary together with a cursor
// (the active line and a position inside it) that is advanced as characters
// are handed out.
type ChineseDict struct {
	lines            []string // dictionary lines available for selection
	currentLineIndex int      // index into lines of the active line; NewChineseDict sets it to -1 until a line is selected
	currentCharIndex int      // position in currentLineRunes of the next rune to examine
	currentLineRunes []rune   // the active line decoded into runes
}

// NewChineseDict reads the file at filePath line by line and builds a
// ChineseDict from it. Every line is trimmed of surrounding whitespace and
// lines that end up empty are dropped. The scanner is configured to accept
// lines of up to 5 MiB.
//
// Errors from opening or scanning the file are returned unchanged. The
// returned dictionary has no line selected yet (currentLineIndex is -1).
func NewChineseDict(filePath string) (*ChineseDict, error) {
	f, openErr := os.Open(filePath)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	// Upper bound on a single line; the scanner grows its buffer up to this.
	const maxLineBytes = 5 * 1024 * 1024

	sc := bufio.NewScanner(f)
	sc.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	var kept []string
	for sc.Scan() {
		text := strings.TrimSpace(sc.Text())
		if text == "" {
			continue // blank after trimming: not part of the dictionary
		}
		kept = append(kept, text)
	}
	if scanErr := sc.Err(); scanErr != nil {
		return nil, scanErr
	}

	return &ChineseDict{
		lines:            kept,
		currentLineIndex: -1, // no line chosen until the first character request
		currentCharIndex: 0,
	}, nil
}

// isValidCharacter reports whether r is either a rune in the CJK Unified
// Ideographs block (U+4E00 through U+9FFF) or an ASCII letter.
func (cd *ChineseDict) isValidCharacter(r rune) bool {
	switch {
	case 0x4E00 <= r && r <= 0x9FFF: // CJK Unified Ideographs
		return true
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z': // ASCII letters
		return true
	default:
		return false
	}
}

// selectNewRandomLine makes a randomly chosen line the active one: it stores
// the line's index, decodes the line into runes, and rewinds the character
// cursor to the start. It does nothing when the dictionary has no lines.
func (cd *ChineseDict) selectNewRandomLine() {
	total := len(cd.lines)
	if total == 0 {
		return
	}
	picked := rand.Intn(total)
	cd.currentLineIndex, cd.currentCharIndex = picked, 0
	cd.currentLineRunes = []rune(cd.lines[picked])
}

// GetRandomCharacter returns one character drawn from the dictionary, or the
// zero rune when no character can be produced.
//
// Behaviour depends on how many lines the dictionary holds:
//
//   - No lines: 0 is returned.
//   - Exactly one line: a rune is picked at a random position in that line.
//     No filtering is applied in this mode, so any rune of the line may be
//     returned.
//   - Several lines: a random line is chosen and walked left to right on
//     successive calls, skipping every rune rejected by isValidCharacter.
//     When the line is exhausted another random line is chosen.
//
// In the multi-line case 0 is returned only if no line contains a valid
// character: after a bounded number of random picks the method falls back to
// scanning every line, so an unlucky run of picks cannot hide a usable line.
func (cd *ChineseDict) GetRandomCharacter() rune {
	if len(cd.lines) == 0 {
		return 0 // nothing to draw from
	}

	if len(cd.lines) == 1 {
		// Check the decoded runes as well as the index, so a dictionary that
		// was not initialised with currentLineIndex == -1 still gets its line
		// loaded instead of reaching rand.Intn(0), which panics.
		if cd.currentLineIndex == -1 || len(cd.currentLineRunes) == 0 {
			cd.selectNewRandomLine()
		}
		if len(cd.currentLineRunes) == 0 {
			return 0 // the only line is empty
		}
		randomIndex := rand.Intn(len(cd.currentLineRunes))
		// Advance the cursor past the pick, mirroring the multi-line path.
		cd.currentCharIndex = randomIndex + 1
		return cd.currentLineRunes[randomIndex]
	}

	// Random phase: bounded so a dictionary with few usable lines cannot
	// spin forever.
	maxAttempts := len(cd.lines) * 2
	for attempts := 0; attempts < maxAttempts; attempts++ {
		// First call, or the active line has been fully consumed.
		if cd.currentLineIndex == -1 || cd.currentCharIndex >= len(cd.currentLineRunes) {
			cd.selectNewRandomLine()
		}

		// Resume walking the active line from the cursor.
		for cd.currentCharIndex < len(cd.currentLineRunes) {
			r := cd.currentLineRunes[cd.currentCharIndex]
			cd.currentCharIndex++
			if cd.isValidCharacter(r) {
				return r
			}
		}
		// The line ran out without a valid character; the cursor now sits at
		// its end, so the next iteration selects a different line.
	}

	// Fallback phase: every random pick landed on a line with nothing usable
	// left. Scan all lines once, starting at a random offset, so a valid
	// character is found whenever one exists.
	start := rand.Intn(len(cd.lines))
	for i := range cd.lines {
		idx := (start + i) % len(cd.lines)
		runes := []rune(cd.lines[idx])
		for pos, r := range runes {
			if cd.isValidCharacter(r) {
				cd.currentLineIndex = idx
				cd.currentLineRunes = runes
				cd.currentCharIndex = pos + 1
				return r
			}
		}
	}

	// No line contains a valid character.
	return 0
}

// GetRandomString builds a string of length runes, each obtained from a
// separate GetRandomCharacter call. It returns the empty string when the
// dictionary has no lines or when length is not positive.
func (cd *ChineseDict) GetRandomString(length int) string {
	if length <= 0 {
		return ""
	}
	if len(cd.lines) == 0 {
		return ""
	}

	picked := make([]rune, 0, length)
	for len(picked) < length {
		picked = append(picked, cd.GetRandomCharacter())
	}
	return string(picked)
}

// GetLineCount reports how many lines the dictionary currently holds.
func (cd *ChineseDict) GetLineCount() int {
	n := len(cd.lines)
	return n
}

// GetCharacterCount walks every rune of every line and reports how many of
// them are accepted by isValidCharacter.
func (cd *ChineseDict) GetCharacterCount() int {
	var total int
	for i := range cd.lines {
		for _, ch := range cd.lines[i] {
			if !cd.isValidCharacter(ch) {
				continue
			}
			total++
		}
	}
	return total
}