#!/usr/bin/env python3
"""
Phase 1 cleanup - pass 2.
Fixes tildes, remaining OCR errors, and specific known-bad passages.
"""

import re
from pathlib import Path

# File read by main(); presumably the output of the previous cleanup pass
# (TODO confirm against the pass-1 script).
INPUT_FILE = "book_cleaned.md"
# File main() writes the corrected text to.
OUTPUT_FILE = "book_cleaned2.md"
# File main() writes the manual-review report to.
REPORT_FILE = "review_report2.txt"

# Tilde fixes: word with tilde -> correct word.
# Entries are applied by fix_tildes() in insertion order using plain substring
# replacement. A few keys carry surrounding context ("~n Budapest",
# "~as thought", "around~'") because the corrupted fragment alone is too short
# to match safely. Every key must be unique: a repeated key in a dict literal
# silently overwrites the earlier entry.
TILDE_FIXES = {
    "t~o": "two",
    "uniq~e": "unique",
    "certa~nty": "certainty",
    "~imilar": "similar",
    "Vaz~onyi": "Vázsonyi",
    "pris~ners": "prisoners",
    "Fran~oise": "Françoise",
    "expect~d": "expected",
    "~n Budapest": "in Budapest",
    "you'~e": "you're",
    "cal~ulus": "calculus",
    "mathe~aticians": "mathematicians",
    "ne~work": "network",
    "algorit~m": "algorithm",
    "sev~nth": "seventh",
    "~as thought": "was thought",
    "every~here": "everywhere",
    "around~'": "around.'",
}

# Additional OCR word fixes
# Maps a mis-recognised string to its correction. fix_ocr() applies the
# entries in insertion order using plain substring replacement, so a key also
# matches when it occurs inside a longer word. Keys written with "\\" contain
# a single literal backslash in the text being searched.
OCR_FIXES = {
    "promIsIng": "promising",
    "UIam": "Ulam",
    "cl.dding": "adding",
    "\\12)": "√2)",
    "\\12,": "√2,",
    "the\\_bin-packing": "the bin-packing",
    "mother's\\_\\_": "mother's __",
    "freefor-all": "free-for-all",
    "Laz16": "László",
    "P6sa": "Pósa",
    "S6s": "Sós",
    "V6s": "Vós",
    "GoHman": "Goffman",
    "fjve": "five",
    "nmnber": "number",
    "e1tl": "eπi",
    # NOTE(review): substring match also turns "payouts" into "pay outs" and
    # rewrites the noun "payout" — confirm this is intended for every hit.
    "payout": "pay out",
    "W olfskehl": "Wolfskehl",
}

# The garbled passage on line 27 — exact match and replacement
# (presumably line 27 of the input file — TODO confirm). The adjacent string
# literals are concatenated into one search string, which
# fix_garbled_passages() looks for verbatim; any change to the escapes below
# changes what is matched.
GARBLED_LINE27_FIND = (
    '"Th\' ere IS an 0 ld\'\'\'"d saying, sal Erd"os. '
    '*"7\\T* 1 *von numerantur, se*d *ponderantur*'
)
# Replacement text substituted for the span matched above.
GARBLED_LINE27_REPLACE = (
    '"There is an old saying," said Erdős. '
    '"Non numerantur, sed ponderantur'
)

# Another garbled passage (line 559 area)
# Matched verbatim, including the stray <sup> tag and the bullet character.
GARBLED_VAZSONYI_FIND = 'VazsonYl <sup>I</sup> • reca11e , d'
GARBLED_VAZSONYI_REPLACE = 'Vázsonyi recalled'

# Fix sup tags that got partially cleaned
# Maps a literal "<sup>…</sup>" fragment to the bare character it wraps.
# fix_remaining_sups() applies these as plain substring replacements.
SUP_FIXES = {
    '<sup>=</sup>': '=',
    '<sup>x</sup>': 'x',
}


def load_text(path):
    """Return the entire contents of the UTF-8 text file at *path*."""
    with open(path, encoding='utf-8') as source:
        contents = source.read()
    return contents


def save_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing contents."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(text)


def fix_tildes(text, fixes=None):
    """Replace known tilde-corrupted words.

    Each key of the table is replaced by its value wherever it occurs as a
    substring, in the table's insertion order. Tildes that the table does not
    cover are left untouched here; find_remaining_issues() is what reports
    leftover tildes for manual review.

    Args:
        text: The document text to clean.
        fixes: Optional mapping of corrupted string -> replacement. Defaults
            to the module-level TILDE_FIXES table.

    Returns:
        A ``(text, count)`` tuple: the corrected text and the total number of
        occurrences that were replaced.
    """
    table = TILDE_FIXES if fixes is None else fixes
    count = 0
    for wrong, right in table.items():
        # count() and replace() both operate on non-overlapping matches, so
        # the tally equals the number of substitutions actually made.
        n = text.count(wrong)
        if n:
            text = text.replace(wrong, right)
            count += n
    return text, count


def fix_ocr(text, fixes=None):
    """Fix remaining OCR errors.

    Each key of the table is replaced by its value wherever it occurs as a
    substring (so it also matches inside longer words), in the table's
    insertion order.

    Args:
        text: The document text to clean.
        fixes: Optional mapping of mis-recognised string -> correction.
            Defaults to the module-level OCR_FIXES table.

    Returns:
        A ``(text, count)`` tuple: the corrected text and the total number of
        occurrences that were replaced.
    """
    table = OCR_FIXES if fixes is None else fixes
    count = 0
    for wrong, right in table.items():
        # One count() pass decides both whether to replace and how much to
        # add to the tally; it matches the same non-overlapping occurrences
        # that replace() rewrites.
        n = text.count(wrong)
        if n:
            text = text.replace(wrong, right)
            count += n
    return text, count


def fix_garbled_passages(text):
    """Repair the specific garbled passages known to this pass.

    Each known passage is searched for verbatim and, when present, every
    occurrence is swapped for its clean replacement.

    Returns:
        A ``(text, count)`` tuple where ``count`` is the number of distinct
        known passages that were found (not the number of occurrences).
    """
    known_passages = (
        (GARBLED_LINE27_FIND, GARBLED_LINE27_REPLACE),
        (GARBLED_VAZSONYI_FIND, GARBLED_VAZSONYI_REPLACE),
    )
    repaired = 0
    for garbled, clean in known_passages:
        if garbled not in text:
            continue
        text = text.replace(garbled, clean)
        repaired += 1
    return text, repaired


def fix_remaining_sups(text, fixes=None):
    """Fix remaining false sup tags.

    Each key of the table (a literal ``<sup>…</sup>`` fragment) is replaced
    by its value wherever it occurs, in the table's insertion order.

    Args:
        text: The document text to clean.
        fixes: Optional mapping of tag fragment -> replacement. Defaults to
            the module-level SUP_FIXES table.

    Returns:
        A ``(text, count)`` tuple: the corrected text and the total number of
        occurrences that were replaced.
    """
    table = SUP_FIXES if fixes is None else fixes
    count = 0
    for wrong, right in table.items():
        # count() sees the same non-overlapping matches replace() rewrites,
        # so a single pass gives both the presence test and the tally.
        n = text.count(wrong)
        if n:
            text = text.replace(wrong, right)
            count += n
    return text, count


def fix_escaped_dollars(text):
    r"""Convert ``\$`` to ``$`` where it introduces a dollar amount in prose.

    Only an escaped dollar sign that is immediately followed by a digit is
    treated as money and unescaped. Any other ``\$`` is left exactly as it
    is, so it is not silently turned into a bare ``$`` in the output.

    Args:
        text: The document text to clean.

    Returns:
        A ``(text, count)`` tuple: the corrected text and the number of
        escaped dollar signs that were unescaped.
    """
    # The lookahead requires a digit after "\$" without consuming it.
    # re.subn returns the (new_text, substitutions_made) pair directly.
    return re.subn(r'\\\$(?=\d)', '$', text)


def find_remaining_issues(text):
    """Find any remaining suspicious patterns for manual review.

    Scans *text* line by line and reports at most one issue per line (the
    first check that fires wins). A line is skipped entirely when, after
    stripping whitespace, it is shorter than 10 characters, starts with
    ``![`` or ``$$``, or both starts and ends with ``$``.

    Checks, in order:
      1. A tilde touching a word character (inside, before, or after a word).
      2. Two or more capitals sandwiched between lowercase letters.
      3. Three or more consecutive apostrophes.
      4. A backslash on a line that carries no LaTeX or image marker.

    Args:
        text: The full document text.

    Returns:
        A list of ``(line_number, excerpt, reason)`` tuples, where
        ``line_number`` is 1-based and ``excerpt`` is the stripped line
        truncated to 120 characters.
    """
    # A tilde adjacent to a word character on either side. The corruptions
    # this pass already knows about include leading ("~imilar") and trailing
    # ("around~'") tildes, so requiring a word character on BOTH sides would
    # let those shapes slip past the review report.
    tilde_re = re.compile(r'\w~\w?|~\w')
    mixed_case_re = re.compile(r'[a-z][A-Z]{2,}[a-z]')
    apostrophes_re = re.compile(r"'{3,}")
    # Any of these on a line means its backslashes are expected (LaTeX or an
    # image reference), so the line is not flagged for them.
    backslash_ok_markers = (
        '$$', '\\times', '\\dots', '\\aleph', '\\begin', '\\pi',
        '![', 'images/',
    )

    issues = []
    for line_no, line in enumerate(text.split('\n'), 1):
        stripped = line.strip()
        if len(stripped) < 10:
            continue
        if stripped.startswith(('![', '$$')):
            continue
        if stripped.startswith('$') and stripped.endswith('$'):
            continue

        excerpt = stripped[:120]

        # Remaining tildes in or next to words
        m = tilde_re.search(line)
        if m:
            issues.append((line_no, excerpt, f"Remaining tilde: ...{m.group()}..."))
            continue

        # Mixed case in middle of word (OCR)
        m = mixed_case_re.search(line)
        if m:
            issues.append((line_no, excerpt, f"Mixed case: ...{m.group()}..."))
            continue

        # Multiple apostrophes
        if apostrophes_re.search(line):
            issues.append((line_no, excerpt, "Multiple apostrophes"))
            continue

        # Backslash outside LaTeX and outside image paths
        if '\\' in line and not any(marker in line for marker in backslash_ok_markers):
            issues.append((line_no, excerpt, "Backslash in prose"))

    return issues


def generate_report(issues, output_path, cleaned_file=None):
    """Write the manual review report.

    Args:
        issues: Sequence of ``(line_number, text, reason)`` tuples to list.
        output_path: Path of the report file to create (overwritten if it
            already exists). Written as UTF-8.
        cleaned_file: Name of the cleaned document the report tells the
            reader to open. Defaults to the module-level OUTPUT_FILE, so the
            instruction always names the file this pass actually wrote.
    """
    if cleaned_file is None:
        cleaned_file = OUTPUT_FILE

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(heavy_rule + "\n")
        f.write("MANUAL REVIEW REPORT - PASS 2\n")
        f.write(heavy_rule + "\n\n")
        f.write(f"Found {len(issues)} lines that may still need manual correction.\n")
        f.write(f"Open {cleaned_file} and fix these lines.\n\n")
        f.write(light_rule + "\n\n")

        # One two-line entry per issue, separated by a blank line.
        for line_num, text, reason in issues:
            f.write(f"Line {line_num}: [{reason}]\n")
            f.write(f"  {text}\n\n")

        f.write(light_rule + "\n")
        f.write("END OF REPORT\n")


def main():
    """Run the pass-2 cleanup from start to finish.

    Reads INPUT_FILE, applies the five fixers in order while printing a
    progress line for each, writes the result to OUTPUT_FILE, scans the
    cleaned text for leftover problems, and writes them to REPORT_FILE.
    Prints an error and returns early when INPUT_FILE does not exist.
    """
    source = Path(INPUT_FILE)
    if not source.exists():
        print(f"Error: {INPUT_FILE} not found.")
        return

    print(f"Reading {INPUT_FILE}...")
    text = load_text(source)
    line_total = len(text.splitlines())
    print(f"  {len(text)} characters, {line_total} lines.")

    # (progress banner, fixer, noun used in the "Fixed N ..." summary line)
    pipeline = (
        ("Step 1: Fixing tilde-corrupted words...", fix_tildes, "occurrences"),
        ("Step 2: Fixing remaining OCR errors...", fix_ocr, "occurrences"),
        ("Step 3: Fixing known garbled passages...", fix_garbled_passages, "passages"),
        ("Step 4: Fixing remaining false <sup> tags...", fix_remaining_sups, "occurrences"),
        ("Step 5: Fixing escaped dollar signs...", fix_escaped_dollars, "occurrences"),
    )
    for banner, fixer, unit in pipeline:
        print(banner)
        text, fixed = fixer(text)
        print(f"  Fixed {fixed} {unit}.")

    print(f"\nSaving cleaned file to {OUTPUT_FILE}...")
    save_text(OUTPUT_FILE, text)

    # The scan runs on the already-saved text, so reported line numbers
    # refer to the cleaned file.
    print("Step 6: Scanning for remaining issues...")
    issues = find_remaining_issues(text)
    issue_total = len(issues)
    print(f"  Found {issue_total} remaining issues.")

    generate_report(issues, REPORT_FILE)

    divider = '=' * 50
    print(f"\n{divider}")
    print("DONE!")
    print(f"  Cleaned file: {OUTPUT_FILE}")
    print(f"  Review report: {REPORT_FILE} ({issue_total} items)")
    print(divider)

# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
