#!/usr/bin/env python3
"""
Phase 1 cleanup script for marker-pdf output.
Fixes page headers, hyphenation, sup tags, OCR artifacts.
Produces cleaned markdown + manual review report.
"""

import re
import sys
from pathlib import Path

# File names used by main(). They are bare names, so they resolve against
# the current working directory.
INPUT_FILE = "Paul_Hoffman_The_Man_Who_Loved_Only_Numbers.md"  # markdown to clean
OUTPUT_FILE = "book_cleaned.md"  # cleaned markdown written by main()
REPORT_FILE = "review_report.txt"  # manual-review report written by main()

# Running page headers that repeat throughout the text — these are NOT real
# chapter titles. In the input they appear as:  #### **HEADER TEXT**
# is_page_header() treats each entry as a substring of the bold heading text,
# so matching is deliberately loose; known OCR misreadings are listed as
# separate entries.
PAGE_HEADERS = [
    "STRAIGHT FROM THE BOOK",
    "EPSZI'S ENIGMA",
    "PROBLEMS WITH SAM AND JOE",
    "PROBLEMS WITH SAM AND .JOE",  # OCR variant with a stray period
    '"GOD MADE THE INTEGERS"',
    "GOD MADE THE INTEGERS",  # unquoted form (also a substring of the entry above)
    "MARGINAL REVENGE",
    "EINSTEIN YS. DOSTOYEYSKY",  # OCR variant: both V's read as Y
    "EINSTEIN YS. DOSTOYEVSKY",  # OCR variant: first V read as Y
    "EINSTEIN VS. DOSTOYEVSKY",
    "GETTING THE GOAT",
    "ACKNOWLEDGMENTS AND SOURCE NOTES",
]

# Real chapter titles (appear once, are actual chapter starts) whose text was
# mangled. Keys are the heading text as found inside `#### **...**`; values
# are the corrected titles. fix_chapter_titles() rewrites each match to a
# `## <title>` heading.
CHAPTER_TITLE_FIXES = {
    "CHAPTER 1t": "CHAPTER π",  # presumably "π" misread as "1t" — verify against the book
    "CHAPTER <sup>0</sup>": "CHAPTER 0",
    "CHAPTER <sup>3</sup>": "CHAPTER 3",
    "CHAPTER <sup>7</sup>": "CHAPTER 7",
}

# Known OCR misreadings and their corrections.
# fix_ocr_words() applies these with str.replace, in insertion order, so each
# key is matched as a case-sensitive SUBSTRING anywhere in the text — not as
# a whole word. Several keys are intentionally word fragments
# (e.g. "ematlclans", "reca11e").
OCR_WORD_FIXES = {
    "mlSSlOn": "mission",
    "ematlclans": "ematicians",
    "tim~": "time",
    "cert~inty": "certainty",
    "saYIng": "saying",
    "principIes": "principles",
    "fonnula": "formula",
    "nmnber": "number",
    "Cannichael": "Carmichael",
    "reca11e": "recalle",
    "VazsonYl": "Vázsonyi",
    ".JOE": "JOE",
}


def load_lines(path):
    """Read a UTF-8 text file and return its lines as a list.

    Each returned line keeps its trailing newline (the last line only if the
    file ends with one).
    """
    with open(path, encoding="utf-8") as handle:
        # Iterating a text file yields the same lines readlines() would.
        return list(handle)


def save_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    Path(path).write_text(text, encoding="utf-8")


def is_page_header(line):
    """Return True when *line* is one of the repeated running page headers.

    A header line has the shape ``#### **TEXT**`` (surrounding whitespace is
    ignored) where TEXT contains any entry of PAGE_HEADERS as a substring.
    """
    candidate = line.strip()

    # Only level-4 headings whose text is entirely bold can qualify.
    parsed = None
    if candidate.startswith("#### **"):
        parsed = re.match(r'^####\s+\*\*(.+?)\*\*\s*$', candidate)
    if parsed is None:
        return False

    title = parsed.group(1).strip()
    # Substring test so that headers with extra OCR noise still match.
    return any(known in title for known in PAGE_HEADERS)


def fix_chapter_titles(text):
    """Promote real chapter headings to ``##`` and repair mangled titles.

    Every ``#### **<key>**`` heading listed in CHAPTER_TITLE_FIXES becomes
    ``## <corrected title>``. Two headings whose text is already correct
    (DR. WORST CASE and CHAPTER e) are promoted to ``##`` unchanged.
    """
    swaps = [
        (f"#### **{wrong}**", f"## {right}")
        for wrong, right in CHAPTER_TITLE_FIXES.items()
    ]
    # These two are genuine chapter titles that need no spelling repair,
    # only the heading-level change.
    swaps += [
        ("#### **DR. WORST CASE**", "## DR. WORST CASE"),
        ("#### **CHAPTER e**", "## CHAPTER e"),
    ]
    for old_heading, new_heading in swaps:
        text = text.replace(old_heading, new_heading)
    return text


def fix_false_sup_tags(text):
    """
    Strip <sup> tags that are OCR false positives; keep real exponents.

    Rules, applied to every ``<sup>...</sup>`` on a single line:

    * Purely numeric content directly preceded by an ASCII letter, digit or
      ``)`` is a real exponent and is kept (``2<sup>67</sup>``,
      ``n<sup>2</sup>``).
    * Otherwise, content that is one of a fixed set of single characters
      commonly lifted out of running prose (``I``, ``a``, ``0`` meaning "o",
      ...) has its tags removed and the character kept.
    * Anything else is left untouched.

    The character before the tag is read from the original string without
    being consumed by the match, so back-to-back or closely spaced tags are
    each judged with their true context.

    Args:
        text: Markdown text to clean.

    Returns:
        The text with false-positive <sup> tags removed.
    """
    # Single characters that OCR frequently mis-tags as superscripts in prose.
    false_positives = {'I', 'A', 'a', 'O', '0', 'o', 'd', 'b', 'n', 'k',
                       'z', 'X', 'B', '1', 'e'}

    def sup_replacer(match):
        content = match.group(1)
        start = match.start()
        # Look at the preceding character in the ORIGINAL text. Capturing it
        # (and the following character) inside the pattern would swallow the
        # context of the next tag.
        before = match.string[start - 1] if start > 0 else ''

        # Numeric content right after a digit, variable letter or closing
        # paren is a genuine exponent.
        if re.fullmatch(r'\d+', content) and re.match(r'[0-9a-zA-Z\)]', before):
            return match.group(0)

        if content in false_positives:
            # '0' / '1' straight after any digit character still reads as an
            # exponent rather than a stray letter.
            if content in ('0', '1') and before.isdigit():
                return match.group(0)
            return content

        # Unknown content: leave the tag for manual review.
        return match.group(0)

    return re.sub(r'<sup>(.*?)</sup>', sup_replacer, text)


def fix_ocr_words(text):
    """Apply every OCR_WORD_FIXES substitution to *text* and return the result.

    Each key is replaced wherever it occurs (plain substring replacement),
    in the mapping's insertion order.
    """
    corrected = text
    for garbled in OCR_WORD_FIXES:
        corrected = corrected.replace(garbled, OCR_WORD_FIXES[garbled])
    return corrected


def remove_page_headers(lines):
    """Return a new list of *lines* without the repeated page-header lines.

    A line is dropped when is_page_header() reports True for it; all other
    lines are kept in their original order.
    """
    return [entry for entry in lines if not is_page_header(entry)]


def remove_orphan_bullets(lines):
    """Return a new list of *lines* without lone bullet markers.

    A line is dropped when, ignoring surrounding whitespace, it consists of
    nothing but a single '•' character. Bullets followed by text are kept.
    """
    return [entry for entry in lines if entry.strip() != '•']


def rejoin_hyphenated_words(text):
    """
    Rejoin words that were hyphenated across page breaks.

    A match is a word fragment ending in ``-``, followed by whitespace that
    contains at least two newlines and ends on a newline (i.e. one or more
    blank lines), followed immediately by a lowercase ASCII letter. The
    hyphen and the whitespace are removed: ``"prob-\\n\\n\\nlems"`` becomes
    ``"problems"``. A hyphen followed by a single newline is left alone.

    Args:
        text: Markdown text to clean.

    Returns:
        The text with split words rejoined.
    """
    # "(optional non-newline whitespace, then a newline)" repeated 2+ times.
    # Each repetition can match in only one way, so a long run of blank
    # lines that is NOT followed by a lowercase letter fails in linear time.
    # (Nested quantifiers that can both absorb newlines, such as
    # `\s*\n(\s*\n)+`, backtrack exponentially on such input.)
    pattern = re.compile(r'(\w+)-(?:[^\S\n]*\n){2,}([a-z])')

    # One pass cannot rejoin chains such as "a-\n\nb-\n\nc" because the
    # continuation letter is consumed by the first match; repeat until the
    # text stops changing.
    prev = None
    while prev != text:
        prev = text
        text = pattern.sub(r'\1\2', text)

    return text


def collapse_excessive_blank_lines(text):
    """Cap runs of blank lines at two.

    Any run of four or more consecutive newline characters (three or more
    empty lines) is replaced by exactly three newlines.
    """
    newline_run = re.compile(r'\n{4,}')
    squeezed = newline_run.sub('\n\n\n', text)
    return squeezed


def _suspicion_reason(line):
    """Return the first reason *line* looks like OCR garbage, or None."""
    # Leftover <sup> tags whose content is short and not purely numeric.
    for content in re.findall(r'<sup>(.*?)</sup>', line):
        if len(content) <= 2 and not re.fullmatch(r'\d+', content):
            return f"Suspicious <sup>{content}</sup>"

    # Two or more capitals sandwiched between lowercase letters.
    if re.search(r'[a-z][A-Z]{2,}[a-z]', line):
        return "Mixed case pattern (possible OCR error)"

    if re.search(r'\w~\w', line):
        return "Tilde in word"

    if re.search(r"'{3,}", line):
        return "Multiple apostrophes (OCR artifact)"

    # Backslashes are expected only in image references/paths.
    if '\\' in line and '![' not in line and 'images/' not in line:
        return "Backslash in text"

    return None


def find_suspicious_lines(text):
    """
    Find lines that likely have OCR garbage and need manual review.

    Lines whose stripped length is under 10 characters and lines that start
    with an image reference (``![``) are skipped. Every remaining line is
    checked, in order, for: a suspicious leftover <sup> tag, a mixed-case
    run inside a word, a tilde inside a word, three or more consecutive
    apostrophes, and a backslash outside image references. Each line is
    reported at most once, with the first reason that applies, so the
    length of the result is the number of flagged lines.

    Args:
        text: The cleaned markdown text.

    Returns:
        A list of ``(line_number, line_text, reason)`` tuples, where
        ``line_number`` is 1-based and ``line_text`` is the stripped line
        truncated to 120 characters.
    """
    issues = []
    for line_number, line in enumerate(text.split('\n'), 1):
        stripped = line.strip()
        if len(stripped) < 10 or stripped.startswith('!['):
            continue

        reason = _suspicion_reason(line)
        if reason is not None:
            issues.append((line_number, stripped[:120], reason))

    return issues


def generate_report(issues, output_path):
    """Write the manual review report to *output_path* as UTF-8 text.

    The report has a banner, a count of the entries in *issues*, one
    two-line entry per ``(line_number, text, reason)`` tuple, and a footer.
    Any existing file at *output_path* is overwritten.
    """
    heavy_rule = "=" * 70 + "\n"
    light_rule = "-" * 70 + "\n"

    with open(output_path, 'w', encoding='utf-8') as report:
        # Banner.
        report.writelines((heavy_rule, "MANUAL REVIEW REPORT\n", heavy_rule, "\n"))

        # Summary and instructions.
        report.writelines((
            f"Found {len(issues)} lines that may need manual correction.\n",
            "Open book_cleaned.md and fix these lines.\n\n",
            light_rule,
            "\n",
        ))

        # One entry per flagged line, separated by a blank line.
        for line_num, text, reason in issues:
            report.writelines((
                f"Line {line_num}: [{reason}]\n",
                f"  {text}\n\n",
            ))

        report.writelines((light_rule, "END OF REPORT\n"))


def main():
    """Run the full cleanup pipeline on INPUT_FILE.

    Reads the input, applies the line-based and text-based cleanup passes in
    order, writes the cleaned markdown to OUTPUT_FILE, scans the result for
    lines needing manual review, writes REPORT_FILE, and prints progress and
    a summary. Exits with status 1 if INPUT_FILE does not exist.
    """
    source = Path(INPUT_FILE)
    if not source.exists():
        print(f"Error: {INPUT_FILE} not found.")
        sys.exit(1)

    print(f"Reading {INPUT_FILE}...")
    lines = load_lines(source)
    print(f"  {len(lines)} lines read.")

    # Steps 1-2 work on the list of lines and report how many survive.
    line_passes = (
        ("Step 1: Removing repeated page headers...", remove_page_headers),
        ("Step 2: Removing orphan bullet markers...", remove_orphan_bullets),
    )
    for banner, line_pass in line_passes:
        print(banner)
        lines = line_pass(lines)
        print(f"  {len(lines)} lines remaining.")

    # Steps 3-7 work on the document as one string. Order matters: chapter
    # titles are fixed before <sup> tags are stripped, because some title
    # keys contain <sup> markup.
    text = ''.join(lines)
    text_passes = (
        ("Step 3: Fixing chapter titles...", fix_chapter_titles),
        ("Step 4: Rejoining hyphenated words across page breaks...",
         rejoin_hyphenated_words),
        ("Step 5: Fixing false superscript tags...", fix_false_sup_tags),
        ("Step 6: Fixing known OCR errors...", fix_ocr_words),
        ("Step 7: Collapsing excessive blank lines...",
         collapse_excessive_blank_lines),
    )
    for banner, text_pass in text_passes:
        print(banner)
        text = text_pass(text)

    print(f"Saving cleaned file to {OUTPUT_FILE}...")
    save_text(OUTPUT_FILE, text)

    # Step 8: flag what the automatic passes could not fix.
    print("Step 8: Scanning for lines needing manual review...")
    issues = find_suspicious_lines(text)
    print(f"  Found {len(issues)} suspicious lines.")

    print(f"Saving review report to {REPORT_FILE}...")
    generate_report(issues, REPORT_FILE)

    closing_lines = (
        "\n" + "=" * 50,
        "DONE!",
        f"  Cleaned file: {OUTPUT_FILE}",
        f"  Review report: {REPORT_FILE} ({len(issues)} items)",
        "=" * 50,
        "\nNext steps:",
        f"  1. Read {REPORT_FILE}",
        f"  2. Open {OUTPUT_FILE} in a text editor",
        "  3. Fix the flagged lines manually",
        "  4. Verify a few pages look correct by spot-checking",
    )
    for row in closing_lines:
        print(row)


# Run the pipeline only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()