#!/usr/bin/env python3 """ Data Preparation Script for The Trial Literary Analysis SLM Downloads and processes source texts for training """ import json import os import re from pathlib import Path # from bs4 import BeautifulSoup # Not needed for basic functionality # import pandas as pd # Not needed for basic functionality from typing import Dict, List, Tuple import requests class TrialDataPrep: def __init__(self, base_dir: str = "data"): self.base_dir = Path(base_dir) self.raw_dir = self.base_dir / "raw" self.processed_dir = self.base_dir / "processed" self.training_dir = self.base_dir / "training" # Create directories for dir_path in [self.raw_dir, self.processed_dir, self.training_dir]: dir_path.mkdir(parents=True, exist_ok=True) def download_gutenberg_text(self) -> str | None: """Download The Trial from Project Gutenberg""" print("Downloading The Trial from Project Gutenberg...") # Project Gutenberg URL for The Trial by Franz Kafka url = "https://www.gutenberg.org/files/7849/7849-0.txt" try: response = requests.get(url) response.raise_for_status() text = response.text file_path = self.raw_dir / "the_trial_full.txt" with open(file_path, "w", encoding="utf-8") as f: f.write(text) print(f"Downloaded and saved to {file_path}") print(f"Text length: {len(text):,} characters") return str(file_path) except Exception as e: print(f"Error downloading text: {e}") return None def parse_chapters(self, text_file: str) -> List[Dict]: """Parse the full text into chapters""" print("Parsing chapters...") with open(text_file, "r", encoding="utf-8") as f: text = f.read() # Find chapter boundaries chapters = [] chapter_pattern = r"Chapter (One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\d+|\d+\-\d+)" # Split by chapter pattern parts = re.split(chapter_pattern, text) # Extract chapter titles and content chapter_matches = list(re.finditer(chapter_pattern, text)) for i, (part, match) in enumerate(zip(parts[1:], chapter_matches)): chapter_num = i + 1 chapter_title = match.group().strip() # Clean up the content content = part.strip() # Remove Gutenberg header/footer if present if "*** START OF" in content: content = content.split("*** START OF")[1] if "*** END OF" in content: content = content.split("*** END OF")[0] chapters.append( { "chapter_number": chapter_num, "title": chapter_title, "content": content.strip(), "word_count": len(content.split()), } ) print(f"Parsed {len(chapters)} chapters") # Save parsed chapters chapters_file = self.processed_dir / "chapters.json" with open(chapters_file, "w", encoding="utf-8") as f: json.dump(chapters, f, indent=2, ensure_ascii=False) return chapters def create_factual_qa_dataset(self, chapters: List[Dict]) -> List[Dict]: """Create factual Q&A pairs from the text""" print("Creating factual Q&A dataset...") qa_pairs = [] # Character-based questions characters = { "Josef K.": ["protagonist", "bank clerk", "arrested", "defendant"], "Frau Grubach": ["landlady", "lodging", "concerned", "witness"], "Fräulein Bürstner": ["neighbor", "tenant", "romantic interest", "confidante"], "Inspector": ["authority", "police", "interrogator", "bureaucrat"], "Uncle Albert": ["uncle", "lawyer", "family", "advisor"], "Leni": ["nurse", "court attendant", "seductress", "helper"], "Huld": ["lawyer", "advocate", "legal system", "professional"], "Titorelli": ["painter", "court informant", "opportunist", "corrupt"], } for character, keywords in characters.items(): # Create questions about each character questions = [ f"Who is {character}?", f"What is {character}'s role in the story?", f"Describe {character}'s character.", f"What happens to {character}?", f"Who is {character} related to?", ] for question in questions: # Find relevant chapters relevant_content = [] for chapter in chapters: if character.lower() in chapter["content"].lower(): relevant_content.append(chapter["content"]) if relevant_content: # Create a simple answer based on the content answer = f"{character} is a character in The Trial by Franz Kafka. " answer += f"Key information about {character} can be found throughout the novel." qa_pairs.append( { "instruction": question, "input": "", "output": answer, "category": "factual", "character": character, } ) # Plot-based questions plot_questions = [ ("Where does Josef K. get arrested?", "In his boarding house room"), ("What is Josef K.'s profession?", "Bank clerk"), ("Who is Josef K.'s landlady?", "Frau Grubach"), ("What crime is Josef K. accused of?", "It's never revealed"), ("Who is Josef K.'s first defender?", "Uncle Albert brings him Huld"), ("Where does the final execution take place?", "In a quarry"), ("What happens to Josef K. at the end?", "He is executed"), ] for question, answer in plot_questions: qa_pairs.append( { "instruction": question, "input": "", "output": answer, "category": "factual", "topic": "plot", } ) # Save factual Q&A dataset qa_file = self.training_dir / "factual_qa.json" with open(qa_file, "w", encoding="utf-8") as f: json.dump(qa_pairs, f, indent=2, ensure_ascii=False) print(f"Created {len(qa_pairs)} factual Q&A pairs") return qa_pairs def create_literary_analysis_dataset(self, chapters: List[Dict]) -> List[Dict]: """Create literary analysis examples""" print("Creating literary analysis dataset...") analysis_examples = [] # Theme analysis themes = { "Bureaucratic Absurdity": [ "The central theme driving the novel's surreal atmosphere", "Explored through the incomprehensible legal proceedings", "Questions the nature of law and authority", ], "Guilt and Innocence": [ "Contrast between actual guilt and presumed guilt", "Josef K.'s struggle to prove his innocence", "The impossibility of defending against unknown charges", ], "Alienation": [ "Josef K.'s isolation from society and support", "The indifference of others to his plight", "Existential loneliness in the face of absurdity", ], "Authority and Oppression": [ "The oppressive nature of anonymous authority", "Powerlessness of the individual against the system", "The psychological impact of institutional power", ], } for theme, descriptions in themes.items(): for desc in descriptions: analysis_examples.append( { "instruction": f"Analyze the theme of {theme} in The Trial.", "input": "", "output": f"{theme} is a major theme in The Trial. {desc}", "category": "analysis", "theme": theme, } ) # Symbolism analysis symbols = { "The Court": ["Obscure authority, labyrinthine bureaucracy, inaccessible justice"], "The Law": ["Abstract power, incomprehensible system, moral judgment"], "The Door": ["Barriers, access, blocked opportunities, judgment"], "The Cathedral": ["Spiritual emptiness, failed guidance, institutional corruption"], } for symbol, meanings in symbols.items(): analysis_examples.append( { "instruction": f"What does {symbol} symbolize in the novel?", "input": "", "output": f"{symbol} symbolizes {', '.join(meanings)} in The Trial.", "category": "analysis", "symbol": symbol, } ) # Save analysis dataset analysis_file = self.training_dir / "literary_analysis.json" with open(analysis_file, "w", encoding="utf-8") as f: json.dump(analysis_examples, f, indent=2, ensure_ascii=False) print(f"Created {len(analysis_examples)} literary analysis examples") return analysis_examples def create_creative_writing_dataset(self, chapters: List[Dict]) -> List[Dict]: """Create creative writing examples in Kafka's style""" print("Creating creative writing dataset...") creative_examples = [] # Style imitation prompts style_prompts = [ "Write a passage describing a character's anxiety about bureaucratic proceedings in Kafka's style.", "Create a dialogue between two characters discussing incomprehensible legal matters.", "Describe a surreal scene in the absurdist, nightmarish style.", "Write a passage about the psychological impact of institutional power.", "Create a scene showing the contrast between individual and anonymous authority.", ] for prompt in style_prompts: creative_examples.append( { "instruction": prompt, "input": "", "output": "This would demonstrate the absurdist, nightmarish style characteristic of Kafka, with precise language and psychological depth.", "category": "creative", "style": "kafka", } ) # Save creative writing dataset creative_file = self.training_dir / "creative_writing.json" with open(creative_file, "w", encoding="utf-8") as f: json.dump(creative_examples, f, indent=2, ensure_ascii=False) print(f"Created {len(creative_examples)} creative writing examples") return creative_examples def create_combined_dataset(self) -> str: """Combine all datasets into training format""" print("Creating combined training dataset...") # Load all datasets datasets = {} dataset_files = { "factual": "factual_qa.json", "analysis": "literary_analysis.json", "creative": "creative_writing.json", } for name, filename in dataset_files.items(): file_path = self.training_dir / filename if file_path.exists(): with open(file_path, "r", encoding="utf-8") as f: datasets[name] = json.load(f) # Combine all examples combined_examples = [] for category, examples in datasets.items(): combined_examples.extend(examples) # Save combined dataset combined_file = self.training_dir / "the_trial_combined.json" with open(combined_file, "w", encoding="utf-8") as f: json.dump(combined_examples, f, indent=2, ensure_ascii=False) print(f"Created combined dataset with {len(combined_examples)} examples") # Create dataset statistics stats = { "total_examples": len(combined_examples), "categories": {name: len(examples) for name, examples in datasets.items()}, "estimated_tokens": len(combined_examples) * 150, # Rough estimate } stats_file = self.training_dir / "dataset_stats.json" with open(stats_file, "w", encoding="utf-8") as f: json.dump(stats, f, indent=2, ensure_ascii=False) print(f"Dataset statistics: {stats}") return str(combined_file) def main(): """Main execution function""" prep = TrialDataPrep() # Step 1: Download source text text_file = prep.download_gutenberg_text() if not text_file: print("Failed to download source text") return # Step 2: Parse chapters chapters = prep.parse_chapters(text_file) # Step 3: Create training datasets factual_qa = prep.create_factual_qa_dataset(chapters) literary_analysis = prep.create_literary_analysis_dataset(chapters) creative_writing = prep.create_creative_writing_dataset(chapters) # Step 4: Create combined dataset combined_file = prep.create_combined_dataset() print("\n" + "=" * 50) print("Data Preparation Complete!") print("=" * 50) print(f"Chapters parsed: {len(chapters)}") print(f"Factual Q&A pairs: {len(factual_qa)}") print(f"Literary analysis examples: {len(literary_analysis)}") print(f"Creative writing examples: {len(creative_writing)}") print(f"Combined dataset: {combined_file}") print("=" * 50) if __name__ == "__main__": main()