#!/usr/bin/env python3
"""
Phase 2: Extract glossary candidates from cleaned text.
Finds proper names, institutions, italicized mathematical terms, and quoted vocabulary.
Writes a plain-text list of candidates for manual review; glossary.yaml is built from it afterwards.
"""

import re
from collections import Counter
from pathlib import Path

# Cleaned book text to scan. A relative path, so it is resolved against the
# directory the script is run from.
INPUT_FILE = "book_cleaned2.md"
# Plain-text report of glossary candidates written by main().
OUTPUT_FILE = "glossary_candidates.txt"

# Words to skip — common English words that happen to be capitalized
# at the start of sentences, or known non-name words
SKIP_WORDS = {
    'The', 'This', 'That', 'These', 'Those', 'There', 'Then', 'They',
    'Their', 'Them', 'Thus', 'Though', 'Through', 'Throughout',
    'What', 'When', 'Where', 'Which', 'While', 'Whether', 'Why', 'Who',
    'Whom', 'Whose',
    'And', 'But', 'For', 'Not', 'Nor', 'Now', 'Yet', 'Still', 'Also',
    'After', 'Again', 'Already', 'Always', 'Among', 'Another',
    'Any', 'Are', 'Away',
    'Back', 'Because', 'Before', 'Being', 'Between', 'Both', 'By',
    'Can', 'Could', 'Come',
    'Did', 'Does', 'Don', 'During',
    'Each', 'Either', 'Else', 'Even', 'Every', 'Eventually',
    'Far', 'Few', 'Finally', 'First', 'Five', 'Four', 'From',
    'Get', 'Give', 'God', 'Good', 'Got', 'Great',
    'Had', 'Has', 'Have', 'Having', 'Here', 'His', 'How', 'However',
    'If', 'In', 'Into', 'Is', 'It', 'Its',
    'Just',
    'Last', 'Let', 'Like', 'Long',
    'Made', 'Make', 'Many', 'May', 'Might', 'More', 'Most', 'Much',
    'Must', 'My',
    'Never', 'New', 'Next', 'No', 'None', 'Nothing',
    'Of', 'Off', 'Often', 'On', 'Once', 'One', 'Only', 'Or',
    'Other', 'Our', 'Out', 'Over',
    'Perhaps', 'Please', 'Prove',
    'Rather',
    'Said', 'Same', 'See', 'She', 'Should', 'Since', 'So', 'Some',
    'Someone', 'Something', 'Soon', 'Such', 'Suppose',
    'Take', 'Than', 'Too', 'To', 'Two',
    'Under', 'Until', 'Up', 'Upon', 'Us',
    'Very',
    'Was', 'We', 'Well', 'Were', 'Will', 'With', 'Without',
    'Would',
    'You', 'Your',
    # Math-context words often capitalized
    # ('Let', 'Suppose' and 'Prove' are already listed above)
    'Consider', 'Given', 'Assume',
    'Note', 'Clearly', 'Obviously', 'Therefore', 'Hence',
    'QED', 'Proof',
    # Short pronouns, prepositions and articles not listed above
    'He', 'At', 'As', 'An', 'A', 'I', 'Do', 'Be',
}

# Lowercase particles that may link two capitalized words of one name
# ("University of Chicago", "John von Neumann").
_CONNECTORS = r'(?:de|von|van|di|du|le|la|the|of|for)'

# First letter of a capitalized word: A-Z plus only the *uppercase* letters
# of U+00C0..U+017D. A plain "À-Ž" range also contains lowercase letters
# such as "é", which would make "émigré" look capitalized.
_UPPER = 'A-Z' + ''.join(
    ch for ch in map(chr, range(0x00C0, 0x017E)) if ch.isupper()
)

# One capitalized word of at least two characters. Accented letters,
# apostrophes and dots (initials, abbreviations) may follow the first letter.
_CAP_WORD = '[' + _UPPER + r"][a-zA-Zà-žéőöüáíó'\.]+"

# Abbreviations whose trailing dot does not end a sentence.
_TITLES = ('Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'St', 'Jr', 'Sr')

# Whitespace joining two words of the same phrase. A phrase may not continue
# across a word that ends in "." unless that word is a single-letter initial
# ("P. Erdős") or one of _TITLES ("Dr. Graham"); otherwise a name at the end
# of a sentence would swallow the first word of the next one ("Erdős. The").
_GAP = (
    r'(?:(?<!\.)'
    + r'|(?<=\b[' + _UPPER + r']\.)'
    + ''.join(r'|(?<=\b' + title + r'\.)' for title in _TITLES)
    + r')\s+'
)

# Up to four capitalized words: a first word, optionally "connector + word",
# then up to two more words. The connector is only consumed together with
# the capitalized word after it, so "Erdős for the ..." yields "Erdős".
_PHRASE_RE = re.compile(
    r'\b(' + _CAP_WORD
    + '(?:' + _GAP + _CONNECTORS + r'\s+' + _CAP_WORD + ')?'
    + '(?:' + _GAP + _CAP_WORD + '){0,2}'
    + r')\b'
)


def extract_capitalized_phrases(text):
    """
    Extract sequences of capitalized words (proper nouns/names).

    Each match of _PHRASE_RE is cleaned up before it is counted:

    * runs of whitespace, including line breaks, collapse to one space, so
      a name wrapped over two lines is counted with its unwrapped form;
    * a one- or two-word match that starts with a SKIP_WORDS entry loses
      that first word ("When Erdős" counts as "Erdős"); it is dropped if
      nothing, or only another skip word, remains;
    * matches of three or more words are kept as they are, even when the
      first word is in SKIP_WORDS;
    * phrases shorter than three characters are dropped.

    Returns Counter of phrases.
    """
    counter = Counter()
    for raw in _PHRASE_RE.findall(text):
        # split() + join collapses newlines and repeated spaces.
        words = raw.split()

        # A sentence-initial stop word should not hide the name after it.
        if len(words) <= 2 and words[0] in SKIP_WORDS:
            words = words[1:]
            if not words or words[0] in SKIP_WORDS:
                continue

        phrase = ' '.join(words)
        # Skip very short
        if len(phrase) < 3:
            continue
        counter[phrase] += 1

    return counter


def extract_math_terms(text):
    """
    Extract mathematical terms — phrases italicized with single asterisks.

    A candidate starts with a letter and consists of 3 to 41 letters and
    whitespace characters between a pair of single ``*``. Accented letters
    are accepted, so "*Erdős number*" is found. Text inside ``**bold**``
    markers is not treated as italic.

    Whitespace inside a term, including line breaks, is collapsed to single
    spaces before counting. A term is counted when it starts with an
    uppercase letter or is entirely lowercase; a term that starts lowercase
    but contains an uppercase letter later is skipped.

    Returns Counter of terms.
    """
    # (?<!\*) and (?!\*) keep the delimiters from being half of a "**" pair.
    # [^\W\d_] is "any Unicode letter".
    italic_pattern = r'(?<!\*)\*([^\W\d_](?:[^\W\d_]|\s){2,40}?)\*(?!\*)'

    counter = Counter()
    for raw in re.findall(italic_pattern, text):
        term = ' '.join(raw.split())
        if len(term) < 3:
            continue
        if term[0].isupper() or term.islower():
            counter[term] += 1

    return counter


def extract_quoted_terms(text):
    """Extract terms in quotes that might be Erdős's special vocabulary.

    A candidate is an all-lowercase run of 3 to 31 letters and whitespace
    characters enclosed in double quotes. Both straight quotes ("...") and
    typographic quotes (“...”) are recognized. Whitespace inside a term,
    including line breaks, is collapsed to single spaces, and only terms
    longer than three characters are counted.

    Returns Counter of terms.
    """
    pattern = r'["“]([a-z][a-z\s]{2,30}?)["”]'
    counter = Counter()
    for raw in re.findall(pattern, text):
        term = ' '.join(raw.split())
        if len(term) > 3:
            counter[term] += 1
    return counter


def _ranked(counts, minimum, keep=None):
    """Return (term, count) pairs with count >= minimum, most frequent first."""
    pairs = [(term, count) for term, count in counts.items()
             if count >= minimum and (keep is None or keep(term))]
    # sorted() is stable, so equal counts keep the mapping's iteration order.
    return sorted(pairs, key=lambda pair: -pair[1])


def _section(titles, entries, template='  {} ({}x)'):
    """Return one report section: ruled heading, a blank line, then entries."""
    rule = '-' * 70
    lines = [rule, *titles, rule, '']
    lines.extend(template.format(term, count) for term, count in entries)
    return lines


def categorize_and_format(names, math_terms, quoted_terms):
    """Organize candidates into categories.

    Args:
        names: mapping of capitalized phrase -> occurrence count.
        math_terms: mapping of italicized term -> occurrence count.
        quoted_terms: mapping of quoted term -> occurrence count.

    Returns:
        The report as one string with five sections. The three sections
        built from ``names`` do not overlap:

        * INSTITUTIONS & PLACES: count >= 2 and the phrase contains an
          institution keyword as a whole word;
        * PEOPLE: count >= 3, no institution keyword and no "Theorem".
          No check is made that the phrase is a person's name, so this
          section holds every remaining frequent phrase;
        * OTHER PROPER NOUNS: count >= 3 and in neither section above.

        Math terms and quoted terms are listed when they occur at least
        twice.
    """
    institution_words = ['University', 'College', 'Institute', 'Academy',
                         'Society', 'Journal', 'Laboratory', 'Labs',
                         'Department', 'School', 'Museum', 'Library',
                         'Hospital', 'Church', 'Prize', 'Award',
                         'Foundation']
    # Whole-word match (with an optional plural ending) so that a name such
    # as "Churchill" is not classified by the "Church" inside it.
    institution_re = re.compile(
        r'\b(?:' + '|'.join(institution_words) + r')(?:e?s)?\b')
    theorem_re = re.compile(r'\bTheorems?\b')

    def is_institution(name):
        return institution_re.search(name) is not None

    def is_person_candidate(name):
        return not is_institution(name) and theorem_re.search(name) is None

    institutions = _ranked(names, 2, keep=is_institution)
    people = _ranked(names, 3, keep=is_person_candidate)
    shown = {name for name, _ in people} | {name for name, _ in institutions}
    other_proper = _ranked(names, 3, keep=lambda name: name not in shown)

    output = [
        '=' * 70,
        'GLOSSARY CANDIDATES',
        '=' * 70,
        '',
        'Review this file and create glossary.yaml from it.',
        'Items marked with [?] need your decision.',
    ]

    sections = [
        _section(['PEOPLE (names appearing 3+ times)'], people),
        _section(['INSTITUTIONS & PLACES (appearing 2+ times)'], institutions),
        _section(['OTHER PROPER NOUNS (appearing 3+ times)',
                  '  (places, nationalities, etc.)'], other_proper),
        _section(['MATHEMATICAL TERMS (italicized, appearing 2+ times)'],
                 _ranked(math_terms, 2)),
        _section(['ERDŐS SPECIAL VOCABULARY (quoted terms, 2+ times)'],
                 _ranked(quoted_terms, 2), template='  "{}" ({}x)'),
    ]
    # Every section is preceded by exactly one blank line.
    for section in sections:
        output.append('')
        output.extend(section)

    return '\n'.join(output)


def main():
    """Run the extraction pipeline from INPUT_FILE to OUTPUT_FILE.

    Prints an error and returns early when INPUT_FILE does not exist.
    Otherwise reads the text, runs the three extractors in order, writes
    the formatted report and prints a summary with the next steps.
    """
    source = Path(INPUT_FILE)
    if not source.exists():
        print(f"Error: {INPUT_FILE} not found.")
        return

    print(f"Reading {INPUT_FILE}...")
    text = load_text(source)

    # (progress message, extractor, noun used in the "Found ..." line)
    stages = (
        ("Extracting capitalized phrases (names, places, institutions)...",
         extract_capitalized_phrases, "capitalized phrases"),
        ("Extracting mathematical terms (italicized)...",
         extract_math_terms, "italicized terms"),
        ("Extracting quoted terms (Erdős vocabulary)...",
         extract_quoted_terms, "quoted terms"),
    )
    results = []
    for message, extractor, noun in stages:
        print(message)
        found = extractor(text)
        print(f"  Found {len(found)} unique {noun}.")
        results.append(found)

    print(f"\nFormatting and saving to {OUTPUT_FILE}...")
    # results holds names, math terms and quoted terms, in that order.
    save_text(OUTPUT_FILE, categorize_and_format(*results))

    banner = "=" * 50
    summary = [
        f"\n{banner}",
        "DONE!",
        f"  Candidates file: {OUTPUT_FILE}",
        banner,
        "\nNext steps:",
        f"  1. Review {OUTPUT_FILE}",
        "  2. I'll help you build glossary.yaml from it",
    ]
    print("\n".join(summary))


def load_text(path):
    """Return the entire contents of *path*, decoded as UTF-8."""
    with open(path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents


def save_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    with open(path, mode='w', encoding='utf-8') as handle:
        handle.write(text)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
