Files
the-trial/scripts/data_preparation.py

367 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Data Preparation Script for The Trial Literary Analysis SLM
Downloads and processes source texts for training
"""
import json
import os
import re
from pathlib import Path
# from bs4 import BeautifulSoup # Not needed for basic functionality
# import pandas as pd # Not needed for basic functionality
from typing import Dict, List, Tuple
import requests
class TrialDataPrep:
def __init__(self, base_dir: str = "data"):
self.base_dir = Path(base_dir)
self.raw_dir = self.base_dir / "raw"
self.processed_dir = self.base_dir / "processed"
self.training_dir = self.base_dir / "training"
# Create directories
for dir_path in [self.raw_dir, self.processed_dir, self.training_dir]:
dir_path.mkdir(parents=True, exist_ok=True)
def download_gutenberg_text(self) -> str | None:
"""Download The Trial from Project Gutenberg"""
print("Downloading The Trial from Project Gutenberg...")
# Project Gutenberg URL for The Trial by Franz Kafka
url = "https://www.gutenberg.org/files/7849/7849-0.txt"
try:
response = requests.get(url)
response.raise_for_status()
text = response.text
file_path = self.raw_dir / "the_trial_full.txt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(text)
print(f"Downloaded and saved to {file_path}")
print(f"Text length: {len(text):,} characters")
return str(file_path)
except Exception as e:
print(f"Error downloading text: {e}")
return None
def parse_chapters(self, text_file: str) -> List[Dict]:
"""Parse the full text into chapters"""
print("Parsing chapters...")
with open(text_file, "r", encoding="utf-8") as f:
text = f.read()
# Find chapter boundaries
chapters = []
chapter_pattern = r"Chapter (One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\d+|\d+\-\d+)"
# Split by chapter pattern
parts = re.split(chapter_pattern, text)
# Extract chapter titles and content
chapter_matches = list(re.finditer(chapter_pattern, text))
for i, (part, match) in enumerate(zip(parts[1:], chapter_matches)):
chapter_num = i + 1
chapter_title = match.group().strip()
# Clean up the content
content = part.strip()
# Remove Gutenberg header/footer if present
if "*** START OF" in content:
content = content.split("*** START OF")[1]
if "*** END OF" in content:
content = content.split("*** END OF")[0]
chapters.append(
{
"chapter_number": chapter_num,
"title": chapter_title,
"content": content.strip(),
"word_count": len(content.split()),
}
)
print(f"Parsed {len(chapters)} chapters")
# Save parsed chapters
chapters_file = self.processed_dir / "chapters.json"
with open(chapters_file, "w", encoding="utf-8") as f:
json.dump(chapters, f, indent=2, ensure_ascii=False)
return chapters
def create_factual_qa_dataset(self, chapters: List[Dict]) -> List[Dict]:
"""Create factual Q&A pairs from the text"""
print("Creating factual Q&A dataset...")
qa_pairs = []
# Character-based questions
characters = {
"Josef K.": ["protagonist", "bank clerk", "arrested", "defendant"],
"Frau Grubach": ["landlady", "lodging", "concerned", "witness"],
"Fräulein Bürstner": ["neighbor", "tenant", "romantic interest", "confidante"],
"Inspector": ["authority", "police", "interrogator", "bureaucrat"],
"Uncle Albert": ["uncle", "lawyer", "family", "advisor"],
"Leni": ["nurse", "court attendant", "seductress", "helper"],
"Huld": ["lawyer", "advocate", "legal system", "professional"],
"Titorelli": ["painter", "court informant", "opportunist", "corrupt"],
}
for character, keywords in characters.items():
# Create questions about each character
questions = [
f"Who is {character}?",
f"What is {character}'s role in the story?",
f"Describe {character}'s character.",
f"What happens to {character}?",
f"Who is {character} related to?",
]
for question in questions:
# Find relevant chapters
relevant_content = []
for chapter in chapters:
if character.lower() in chapter["content"].lower():
relevant_content.append(chapter["content"])
if relevant_content:
# Create a simple answer based on the content
answer = f"{character} is a character in The Trial by Franz Kafka. "
answer += f"Key information about {character} can be found throughout the novel."
qa_pairs.append(
{
"instruction": question,
"input": "",
"output": answer,
"category": "factual",
"character": character,
}
)
# Plot-based questions
plot_questions = [
("Where does Josef K. get arrested?", "In his boarding house room"),
("What is Josef K.'s profession?", "Bank clerk"),
("Who is Josef K.'s landlady?", "Frau Grubach"),
("What crime is Josef K. accused of?", "It's never revealed"),
("Who is Josef K.'s first defender?", "Uncle Albert brings him Huld"),
("Where does the final execution take place?", "In a quarry"),
("What happens to Josef K. at the end?", "He is executed"),
]
for question, answer in plot_questions:
qa_pairs.append(
{
"instruction": question,
"input": "",
"output": answer,
"category": "factual",
"topic": "plot",
}
)
# Save factual Q&A dataset
qa_file = self.training_dir / "factual_qa.json"
with open(qa_file, "w", encoding="utf-8") as f:
json.dump(qa_pairs, f, indent=2, ensure_ascii=False)
print(f"Created {len(qa_pairs)} factual Q&A pairs")
return qa_pairs
def create_literary_analysis_dataset(self, chapters: List[Dict]) -> List[Dict]:
"""Create literary analysis examples"""
print("Creating literary analysis dataset...")
analysis_examples = []
# Theme analysis
themes = {
"Bureaucratic Absurdity": [
"The central theme driving the novel's surreal atmosphere",
"Explored through the incomprehensible legal proceedings",
"Questions the nature of law and authority",
],
"Guilt and Innocence": [
"Contrast between actual guilt and presumed guilt",
"Josef K.'s struggle to prove his innocence",
"The impossibility of defending against unknown charges",
],
"Alienation": [
"Josef K.'s isolation from society and support",
"The indifference of others to his plight",
"Existential loneliness in the face of absurdity",
],
"Authority and Oppression": [
"The oppressive nature of anonymous authority",
"Powerlessness of the individual against the system",
"The psychological impact of institutional power",
],
}
for theme, descriptions in themes.items():
for desc in descriptions:
analysis_examples.append(
{
"instruction": f"Analyze the theme of {theme} in The Trial.",
"input": "",
"output": f"{theme} is a major theme in The Trial. {desc}",
"category": "analysis",
"theme": theme,
}
)
# Symbolism analysis
symbols = {
"The Court": ["Obscure authority, labyrinthine bureaucracy, inaccessible justice"],
"The Law": ["Abstract power, incomprehensible system, moral judgment"],
"The Door": ["Barriers, access, blocked opportunities, judgment"],
"The Cathedral": ["Spiritual emptiness, failed guidance, institutional corruption"],
}
for symbol, meanings in symbols.items():
analysis_examples.append(
{
"instruction": f"What does {symbol} symbolize in the novel?",
"input": "",
"output": f"{symbol} symbolizes {', '.join(meanings)} in The Trial.",
"category": "analysis",
"symbol": symbol,
}
)
# Save analysis dataset
analysis_file = self.training_dir / "literary_analysis.json"
with open(analysis_file, "w", encoding="utf-8") as f:
json.dump(analysis_examples, f, indent=2, ensure_ascii=False)
print(f"Created {len(analysis_examples)} literary analysis examples")
return analysis_examples
def create_creative_writing_dataset(self, chapters: List[Dict]) -> List[Dict]:
"""Create creative writing examples in Kafka's style"""
print("Creating creative writing dataset...")
creative_examples = []
# Style imitation prompts
style_prompts = [
"Write a passage describing a character's anxiety about bureaucratic proceedings in Kafka's style.",
"Create a dialogue between two characters discussing incomprehensible legal matters.",
"Describe a surreal scene in the absurdist, nightmarish style.",
"Write a passage about the psychological impact of institutional power.",
"Create a scene showing the contrast between individual and anonymous authority.",
]
for prompt in style_prompts:
creative_examples.append(
{
"instruction": prompt,
"input": "",
"output": "This would demonstrate the absurdist, nightmarish style characteristic of Kafka, with precise language and psychological depth.",
"category": "creative",
"style": "kafka",
}
)
# Save creative writing dataset
creative_file = self.training_dir / "creative_writing.json"
with open(creative_file, "w", encoding="utf-8") as f:
json.dump(creative_examples, f, indent=2, ensure_ascii=False)
print(f"Created {len(creative_examples)} creative writing examples")
return creative_examples
def create_combined_dataset(self) -> str:
"""Combine all datasets into training format"""
print("Creating combined training dataset...")
# Load all datasets
datasets = {}
dataset_files = {
"factual": "factual_qa.json",
"analysis": "literary_analysis.json",
"creative": "creative_writing.json",
}
for name, filename in dataset_files.items():
file_path = self.training_dir / filename
if file_path.exists():
with open(file_path, "r", encoding="utf-8") as f:
datasets[name] = json.load(f)
# Combine all examples
combined_examples = []
for category, examples in datasets.items():
combined_examples.extend(examples)
# Save combined dataset
combined_file = self.training_dir / "the_trial_combined.json"
with open(combined_file, "w", encoding="utf-8") as f:
json.dump(combined_examples, f, indent=2, ensure_ascii=False)
print(f"Created combined dataset with {len(combined_examples)} examples")
# Create dataset statistics
stats = {
"total_examples": len(combined_examples),
"categories": {name: len(examples) for name, examples in datasets.items()},
"estimated_tokens": len(combined_examples) * 150, # Rough estimate
}
stats_file = self.training_dir / "dataset_stats.json"
with open(stats_file, "w", encoding="utf-8") as f:
json.dump(stats, f, indent=2, ensure_ascii=False)
print(f"Dataset statistics: {stats}")
return str(combined_file)
def main():
    """Run the full data-preparation pipeline end to end."""
    prep = TrialDataPrep()

    # Step 1: fetch the source text; everything downstream depends on it.
    text_file = prep.download_gutenberg_text()
    if not text_file:
        print("Failed to download source text")
        return

    # Step 2: split the raw text into chapters.
    chapters = prep.parse_chapters(text_file)

    # Step 3: build the three per-category training datasets.
    factual_qa = prep.create_factual_qa_dataset(chapters)
    literary_analysis = prep.create_literary_analysis_dataset(chapters)
    creative_writing = prep.create_creative_writing_dataset(chapters)

    # Step 4: merge everything into the combined training file.
    combined_file = prep.create_combined_dataset()

    banner = "=" * 50
    print("\n" + banner)
    print("Data Preparation Complete!")
    print(banner)
    print(f"Chapters parsed: {len(chapters)}")
    print(f"Factual Q&A pairs: {len(factual_qa)}")
    print(f"Literary analysis examples: {len(literary_analysis)}")
    print(f"Creative writing examples: {len(creative_writing)}")
    print(f"Combined dataset: {combined_file}")
    print(banner)


if __name__ == "__main__":
    main()