Files
the-trial/scripts/data_preparation.py

367 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Data Preparation Script for The Trial Literary Analysis SLM
Downloads and processes source texts for training
"""
import json
import os
import re
from pathlib import Path
# from bs4 import BeautifulSoup # Not needed for basic functionality
# import pandas as pd # Not needed for basic functionality
from typing import Dict, List, Tuple
import requests
class TrialDataPrep:
def __init__(self, base_dir: str = "data"):
self.base_dir = Path(base_dir)
self.raw_dir = self.base_dir / "raw"
self.processed_dir = self.base_dir / "processed"
self.training_dir = self.base_dir / "training"
# Create directories
for dir_path in [self.raw_dir, self.processed_dir, self.training_dir]:
dir_path.mkdir(parents=True, exist_ok=True)
def download_gutenberg_text(self) -> str | None:
"""Download The Trial from Project Gutenberg"""
print("Downloading The Trial from Project Gutenberg...")
# Project Gutenberg URL for The Trial by Franz Kafka
url = "https://www.gutenberg.org/files/7849/7849-0.txt"
try:
response = requests.get(url)
response.raise_for_status()
text = response.text
file_path = self.raw_dir / "the_trial_full.txt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(text)
print(f"Downloaded and saved to {file_path}")
print(f"Text length: {len(text):,} characters")
return str(file_path)
except Exception as e:
print(f"Error downloading text: {e}")
return None
def parse_chapters(self, text_file: str) -> List[Dict]:
"""Parse the full text into chapters"""
print("Parsing chapters...")
with open(text_file, "r", encoding="utf-8") as f:
text = f.read()
# Find chapter boundaries
chapters = []
chapter_pattern = r"Chapter (One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\d+|\d+\-\d+)"
# Split by chapter pattern
parts = re.split(chapter_pattern, text)
# Extract chapter titles and content
chapter_matches = list(re.finditer(chapter_pattern, text))
for i, (part, match) in enumerate(zip(parts[1:], chapter_matches)):
chapter_num = i + 1
chapter_title = match.group().strip()
# Clean up the content
content = part.strip()
# Remove Gutenberg header/footer if present
if "*** START OF" in content:
content = content.split("*** START OF")[1]
if "*** END OF" in content:
content = content.split("*** END OF")[0]
chapters.append(
{
"chapter_number": chapter_num,
"title": chapter_title,
"content": content.strip(),
"word_count": len(content.split()),
}
)
print(f"Parsed {len(chapters)} chapters")
# Save parsed chapters
chapters_file = self.processed_dir / "chapters.json"
with open(chapters_file, "w", encoding="utf-8") as f:
json.dump(chapters, f, indent=2, ensure_ascii=False)
return chapters
def create_factual_qa_dataset(self, chapters: List[Dict]) -> List[Dict]:
"""Create factual Q&A pairs from the text"""
print("Creating factual Q&A dataset...")
qa_pairs = []
# Character-based questions
characters = {
"Josef K.": ["protagonist", "bank clerk", "arrested", "defendant"],
"Frau Grubach": ["landlady", "lodging", "concerned", "witness"],
"Fräulein Bürstner": ["neighbor", "tenant", "romantic interest", "confidante"],
"Inspector": ["authority", "police", "interrogator", "bureaucrat"],
"Uncle Albert": ["uncle", "lawyer", "family", "advisor"],
"Leni": ["nurse", "court attendant", "seductress", "helper"],
"Huld": ["lawyer", "advocate", "legal system", "professional"],
"Titorelli": ["painter", "court informant", "opportunist", "corrupt"],
}
for character, keywords in characters.items():
# Create questions about each character
questions = [
f"Who is {character}?",
f"What is {character}'s role in the story?",
f"Describe {character}'s character.",
f"What happens to {character}?",
f"Who is {character} related to?",
]
for question in questions:
# Find relevant chapters
relevant_content = []
for chapter in chapters:
if character.lower() in chapter["content"].lower():
relevant_content.append(chapter["content"])
if relevant_content:
# Create a simple answer based on the content
answer = f"{character} is a character in The Trial by Franz Kafka. "
answer += f"Key information about {character} can be found throughout the novel."
qa_pairs.append(
{
"instruction": question,
"input": "",
"output": answer,
"category": "factual",
"character": character,
}
)
# Plot-based questions
plot_questions = [
("Where does Josef K. get arrested?", "In his boarding house room"),
("What is Josef K.'s profession?", "Bank clerk"),
("Who is Josef K.'s landlady?", "Frau Grubach"),
("What crime is Josef K. accused of?", "It's never revealed"),
("Who is Josef K.'s first defender?", "Uncle Albert brings him Huld"),
("Where does the final execution take place?", "In a quarry"),
("What happens to Josef K. at the end?", "He is executed"),
]
for question, answer in plot_questions:
qa_pairs.append(
{
"instruction": question,
"input": "",
"output": answer,
"category": "factual",
"topic": "plot",
}
)
# Save factual Q&A dataset
qa_file = self.training_dir / "factual_qa.json"
with open(qa_file, "w", encoding="utf-8") as f:
json.dump(qa_pairs, f, indent=2, ensure_ascii=False)
print(f"Created {len(qa_pairs)} factual Q&A pairs")
return qa_pairs
def create_literary_analysis_dataset(self, chapters: List[Dict]) -> List[Dict]:
"""Create literary analysis examples"""
print("Creating literary analysis dataset...")
analysis_examples = []
# Theme analysis
themes = {
"Bureaucratic Absurdity": [
"The central theme driving the novel's surreal atmosphere",
"Explored through the incomprehensible legal proceedings",
"Questions the nature of law and authority",
],
"Guilt and Innocence": [
"Contrast between actual guilt and presumed guilt",
"Josef K.'s struggle to prove his innocence",
"The impossibility of defending against unknown charges",
],
"Alienation": [
"Josef K.'s isolation from society and support",
"The indifference of others to his plight",
"Existential loneliness in the face of absurdity",
],
"Authority and Oppression": [
"The oppressive nature of anonymous authority",
"Powerlessness of the individual against the system",
"The psychological impact of institutional power",
],
}
for theme, descriptions in themes.items():
for desc in descriptions:
analysis_examples.append(
{
"instruction": f"Analyze the theme of {theme} in The Trial.",
"input": "",
"output": f"{theme} is a major theme in The Trial. {desc}",
"category": "analysis",
"theme": theme,
}
)
# Symbolism analysis
symbols = {
"The Court": ["Obscure authority, labyrinthine bureaucracy, inaccessible justice"],
"The Law": ["Abstract power, incomprehensible system, moral judgment"],
"The Door": ["Barriers, access, blocked opportunities, judgment"],
"The Cathedral": ["Spiritual emptiness, failed guidance, institutional corruption"],
}
for symbol, meanings in symbols.items():
analysis_examples.append(
{
"instruction": f"What does {symbol} symbolize in the novel?",
"input": "",
"output": f"{symbol} symbolizes {', '.join(meanings)} in The Trial.",
"category": "analysis",
"symbol": symbol,
}
)
# Save analysis dataset
analysis_file = self.training_dir / "literary_analysis.json"
with open(analysis_file, "w", encoding="utf-8") as f:
json.dump(analysis_examples, f, indent=2, ensure_ascii=False)
print(f"Created {len(analysis_examples)} literary analysis examples")
return analysis_examples
def create_creative_writing_dataset(self, chapters: List[Dict]) -> List[Dict]:
"""Create creative writing examples in Kafka's style"""
print("Creating creative writing dataset...")
creative_examples = []
# Style imitation prompts
style_prompts = [
"Write a passage describing a character's anxiety about bureaucratic proceedings in Kafka's style.",
"Create a dialogue between two characters discussing incomprehensible legal matters.",
"Describe a surreal scene in the absurdist, nightmarish style.",
"Write a passage about the psychological impact of institutional power.",
"Create a scene showing the contrast between individual and anonymous authority.",
]
for prompt in style_prompts:
creative_examples.append(
{
"instruction": prompt,
"input": "",
"output": "This would demonstrate the absurdist, nightmarish style characteristic of Kafka, with precise language and psychological depth.",
"category": "creative",
"style": "kafka",
}
)
# Save creative writing dataset
creative_file = self.training_dir / "creative_writing.json"
with open(creative_file, "w", encoding="utf-8") as f:
json.dump(creative_examples, f, indent=2, ensure_ascii=False)
print(f"Created {len(creative_examples)} creative writing examples")
return creative_examples
def create_combined_dataset(self) -> str:
"""Combine all datasets into training format"""
print("Creating combined training dataset...")
# Load all datasets
datasets = {}
dataset_files = {
"factual": "factual_qa.json",
"analysis": "literary_analysis.json",
"creative": "creative_writing.json",
}
for name, filename in dataset_files.items():
file_path = self.training_dir / filename
if file_path.exists():
with open(file_path, "r", encoding="utf-8") as f:
datasets[name] = json.load(f)
# Combine all examples
combined_examples = []
for category, examples in datasets.items():
combined_examples.extend(examples)
# Save combined dataset
combined_file = self.training_dir / "the_trial_combined.json"
with open(combined_file, "w", encoding="utf-8") as f:
json.dump(combined_examples, f, indent=2, ensure_ascii=False)
print(f"Created combined dataset with {len(combined_examples)} examples")
# Create dataset statistics
stats = {
"total_examples": len(combined_examples),
"categories": {name: len(examples) for name, examples in datasets.items()},
"estimated_tokens": len(combined_examples) * 150, # Rough estimate
}
stats_file = self.training_dir / "dataset_stats.json"
with open(stats_file, "w", encoding="utf-8") as f:
json.dump(stats, f, indent=2, ensure_ascii=False)
print(f"Dataset statistics: {stats}")
return str(combined_file)
def main():
    """Run the full data-preparation pipeline end to end."""
    prep = TrialDataPrep()

    # Step 1: fetch the source text; everything downstream depends on it.
    text_file = prep.download_gutenberg_text()
    if not text_file:
        print("Failed to download source text")
        return

    # Step 2: split the raw text into chapters.
    chapters = prep.parse_chapters(text_file)

    # Step 3: build the three per-category training datasets.
    factual_qa = prep.create_factual_qa_dataset(chapters)
    literary_analysis = prep.create_literary_analysis_dataset(chapters)
    creative_writing = prep.create_creative_writing_dataset(chapters)

    # Step 4: merge everything into the combined training file.
    combined_file = prep.create_combined_dataset()

    banner = "=" * 50
    print("\n" + banner)
    print("Data Preparation Complete!")
    print(banner)
    print(f"Chapters parsed: {len(chapters)}")
    print(f"Factual Q&A pairs: {len(factual_qa)}")
    print(f"Literary analysis examples: {len(literary_analysis)}")
    print(f"Creative writing examples: {len(creative_writing)}")
    print(f"Combined dataset: {combined_file}")
    print(banner)


if __name__ == "__main__":
    main()