#!/usr/bin/env python3
"""
Phase 1 cleanup - pass 2.
Fixes tildes, remaining OCR errors, and specific known-bad passages.
"""

import re
from pathlib import Path

INPUT_FILE = "book_cleaned.md"
OUTPUT_FILE = "book_cleaned2.md"
REPORT_FILE = "review_report2.txt"

# Tilde fixes: word with tilde -> correct word
TILDE_FIXES = {
    "t~o": "two",
    "uniq~e": "unique",
    "certa~nty": "certainty",
    "~imilar": "similar",
    "Vaz~onyi": "Vázsonyi",
    "pris~ners": "prisoners",
    "Fran~oise": "Françoise",
    "expect~d": "expected",
    "~n Budapest": "in Budapest",
    "you'~e": "you're",
    "cal~ulus": "calculus",
    "mathe~aticians": "mathematicians",
    "ne~work": "network",
    "algorit~m": "algorithm",
    "sev~nth": "seventh",
    "~as thought": "was thought",
    "every~here": "everywhere",
    "around~'": "around.'",
}

# Additional OCR word fixes
OCR_FIXES = {
    "promIsIng": "promising",
    "UIam": "Ulam",
    "cl.dding": "adding",
    "\\12)": "√2)",
    "\\12,": "√2,",
    "the\\_bin-packing": "the bin-packing",
    "mother's\\_\\_": "mother's __",
    "freefor-all": "free-for-all",
    "Laz16": "László",
    "P6sa": "Pósa",
    "S6s": "Sós",
    "V6s": "Vós",
    "GoHman": "Goffman",
    "fjve": "five",
    "nmnber": "number",
    "e1tl": "eπi",
    "payout": "pay out",
    "W olfskehl": "Wolfskehl",
}

# The garbled passage on line 27 — exact match and replacement
GARBLED_LINE27_FIND = (
    '"Th\' ere IS an 0 ld\'\'\'"d saying, sal Erd"os. '
    '*"7\\T* 1 *von numerantur, se*d *ponderantur*'
)
GARBLED_LINE27_REPLACE = (
    '"There is an old saying," said Erdős. '
    '"Non numerantur, sed ponderantur'
)

# Another garbled passage (line 559 area)
GARBLED_VAZSONYI_FIND = 'VazsonYl ^I • reca11e , d'
GARBLED_VAZSONYI_REPLACE = 'Vázsonyi recalled'

# Fix sup tags that got partially cleaned
SUP_FIXES = {
    '⁼': '=',
    '^x': 'x',
}

# An escaped dollar sign immediately followed by a digit, i.e. a money amount.
_ESCAPED_MONEY_RE = re.compile(r'\\\$(?=\d)')

# Patterns used by find_remaining_issues; compiled once rather than per line.
_TILDE_IN_WORD_RE = re.compile(r'\w~\w')
_MIXED_CASE_RE = re.compile(r'[a-z][A-Z]{2,}[a-z]')
_MULTI_APOSTROPHE_RE = re.compile(r"'{3,}")

# A line containing a backslash AND any of these substrings is treated as
# LaTeX or an image reference and is not reported as "Backslash in prose".
_BACKSLASH_OK_MARKERS = (
    '$$', '\\times', '\\dots',
    '\\aleph', '\\begin', '\\pi',
    '![', 'images/',
)


def load_text(path):
    """Return the full contents of the UTF-8 text file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


def save_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)


def _apply_replacements(text, table):
    """Apply every ``wrong -> right`` substring replacement in *table*.

    Entries are applied in the table's iteration order, each against the
    result of the previous one. Returns ``(new_text, count)`` where *count*
    is the total number of occurrences replaced.
    """
    total = 0
    for wrong, right in table.items():
        hits = text.count(wrong)
        if hits:
            text = text.replace(wrong, right)
            total += hits
    return text, total


def fix_tildes(text):
    """Replace known tilde-corrupted words.

    Returns ``(new_text, count)``. Only the exact strings listed in
    TILDE_FIXES are handled; other tildes are left for
    find_remaining_issues to report.
    """
    return _apply_replacements(text, TILDE_FIXES)


def fix_ocr(text):
    """Fix remaining OCR errors listed in OCR_FIXES.

    Returns ``(new_text, count)``.
    """
    return _apply_replacements(text, OCR_FIXES)


def fix_garbled_passages(text):
    """Fix specific known garbled passages.

    Returns ``(new_text, count)`` where *count* is the number of distinct
    passages found (0-2), not the number of individual occurrences.
    """
    passages = (
        (GARBLED_LINE27_FIND, GARBLED_LINE27_REPLACE),
        (GARBLED_VAZSONYI_FIND, GARBLED_VAZSONYI_REPLACE),
    )
    count = 0
    for garbled, fixed in passages:
        if garbled in text:
            text = text.replace(garbled, fixed)
            count += 1
    return text, count


def fix_remaining_sups(text):
    """Fix remaining false sup tags listed in SUP_FIXES.

    Returns ``(new_text, count)``.
    """
    return _apply_replacements(text, SUP_FIXES)


def fix_escaped_dollars(text):
    """Convert \\$ to $ where it is a dollar amount in prose.

    Only a ``\\$`` that is directly followed by a digit is unescaped; any
    other ``\\$`` is left untouched so it is not silently turned into a
    bare ``$``. Returns ``(new_text, count)``.
    """
    return _ESCAPED_MONEY_RE.subn('$', text)


def find_remaining_issues(text):
    """Find any remaining suspicious patterns for manual review.

    Returns a list of ``(line_number, excerpt, reason)`` tuples, with
    1-based line numbers and the excerpt cut to 120 characters. At most one
    issue is reported per line; checks run in the order tilde, mixed case,
    multiple apostrophes, backslash.
    """
    issues = []
    for line_num, line in enumerate(text.split('\n'), 1):
        stripped = line.strip()

        # Skip very short lines, image lines and math lines.
        if len(stripped) < 10:
            continue
        if stripped.startswith(('![', '$$')):
            continue
        if stripped.startswith('$') and stripped.endswith('$'):
            continue

        excerpt = stripped[:120]

        # Remaining tildes in words
        m = _TILDE_IN_WORD_RE.search(line)
        if m:
            issues.append(
                (line_num, excerpt, f"Remaining tilde: ...{m.group()}..."))
            continue

        # Mixed case in middle of word (OCR)
        m = _MIXED_CASE_RE.search(line)
        if m:
            issues.append(
                (line_num, excerpt, f"Mixed case: ...{m.group()}..."))
            continue

        # Multiple apostrophes
        if _MULTI_APOSTROPHE_RE.search(line):
            issues.append((line_num, excerpt, "Multiple apostrophes"))
            continue

        # Backslash that is not in LaTeX and not in an image path
        if '\\' in line:
            if any(marker in line for marker in _BACKSLASH_OK_MARKERS):
                continue
            issues.append((line_num, excerpt, "Backslash in prose"))

    return issues


def generate_report(issues, output_path):
    """Write the manual review report.

    *issues* is the list of ``(line_number, excerpt, reason)`` tuples
    produced by find_remaining_issues; the report is written to
    *output_path* as UTF-8.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(heavy_rule + "\n")
        f.write("MANUAL REVIEW REPORT - PASS 2\n")
        f.write(heavy_rule + "\n\n")
        f.write(
            f"Found {len(issues)} lines that may still need "
            "manual correction.\n")
        f.write(f"Open {OUTPUT_FILE} and fix these lines.\n\n")
        f.write(light_rule + "\n\n")
        for line_num, text, reason in issues:
            f.write(f"Line {line_num}: [{reason}]\n")
            f.write(f" {text}\n\n")
        f.write(light_rule + "\n")
        f.write("END OF REPORT\n")


def main():
    """Run every cleanup step on INPUT_FILE and write the outputs.

    Writes the cleaned text to OUTPUT_FILE and the manual-review report to
    REPORT_FILE. Prints an error and returns without writing anything when
    INPUT_FILE does not exist.
    """
    input_path = Path(INPUT_FILE)
    if not input_path.exists():
        print(f"Error: {INPUT_FILE} not found.")
        return

    print(f"Reading {INPUT_FILE}...")
    text = load_text(input_path)
    print(f" {len(text)} characters, {len(text.splitlines())} lines.")

    print("Step 1: Fixing tilde-corrupted words...")
    text, n = fix_tildes(text)
    print(f" Fixed {n} occurrences.")

    print("Step 2: Fixing remaining OCR errors...")
    text, n = fix_ocr(text)
    print(f" Fixed {n} occurrences.")

    print("Step 3: Fixing known garbled passages...")
    text, n = fix_garbled_passages(text)
    print(f" Fixed {n} passages.")

    print("Step 4: Fixing remaining false sup tags...")
    text, n = fix_remaining_sups(text)
    print(f" Fixed {n} occurrences.")

    print("Step 5: Fixing escaped dollar signs...")
    text, n = fix_escaped_dollars(text)
    print(f" Fixed {n} occurrences.")

    print(f"\nSaving cleaned file to {OUTPUT_FILE}...")
    save_text(OUTPUT_FILE, text)

    print("Step 6: Scanning for remaining issues...")
    issues = find_remaining_issues(text)
    print(f" Found {len(issues)} remaining issues.")

    generate_report(issues, REPORT_FILE)

    print(f"\n{'=' * 50}")
    print("DONE!")
    print(f" Cleaned file: {OUTPUT_FILE}")
    print(f" Review report: {REPORT_FILE} ({len(issues)} items)")
    print(f"{'=' * 50}")


if __name__ == "__main__":
    main()