#!/usr/bin/env python3
"""
Data Preparation Script for The Trial Literary Analysis SLM

Downloads and processes source texts for training.
"""

import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple

import requests
|
|
|
|
|
|
class TrialDataPrep:
|
|
def __init__(self, base_dir: str = "data"):
|
|
self.base_dir = Path(base_dir)
|
|
self.raw_dir = self.base_dir / "raw"
|
|
self.processed_dir = self.base_dir / "processed"
|
|
self.training_dir = self.base_dir / "training"
|
|
|
|
# Create directories
|
|
for dir_path in [self.raw_dir, self.processed_dir, self.training_dir]:
|
|
dir_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
def download_gutenberg_text(self) -> str | None:
|
|
"""Download The Trial from Project Gutenberg"""
|
|
print("Downloading The Trial from Project Gutenberg...")
|
|
|
|
# Project Gutenberg URL for The Trial by Franz Kafka
|
|
url = "https://www.gutenberg.org/files/7849/7849-0.txt"
|
|
|
|
try:
|
|
response = requests.get(url)
|
|
response.raise_for_status()
|
|
|
|
text = response.text
|
|
file_path = self.raw_dir / "the_trial_full.txt"
|
|
|
|
with open(file_path, "w", encoding="utf-8") as f:
|
|
f.write(text)
|
|
|
|
print(f"Downloaded and saved to {file_path}")
|
|
print(f"Text length: {len(text):,} characters")
|
|
|
|
return str(file_path)
|
|
|
|
except Exception as e:
|
|
print(f"Error downloading text: {e}")
|
|
return None
|
|
|
|
def parse_chapters(self, text_file: str) -> List[Dict]:
|
|
"""Parse the full text into chapters"""
|
|
print("Parsing chapters...")
|
|
|
|
with open(text_file, "r", encoding="utf-8") as f:
|
|
text = f.read()
|
|
|
|
# Find chapter boundaries
|
|
chapters = []
|
|
chapter_pattern = r"Chapter (One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\d+|\d+\-\d+)"
|
|
|
|
# Split by chapter pattern
|
|
parts = re.split(chapter_pattern, text)
|
|
|
|
# Extract chapter titles and content
|
|
chapter_matches = list(re.finditer(chapter_pattern, text))
|
|
|
|
for i, (part, match) in enumerate(zip(parts[1:], chapter_matches)):
|
|
chapter_num = i + 1
|
|
chapter_title = match.group().strip()
|
|
|
|
# Clean up the content
|
|
content = part.strip()
|
|
|
|
# Remove Gutenberg header/footer if present
|
|
if "*** START OF" in content:
|
|
content = content.split("*** START OF")[1]
|
|
if "*** END OF" in content:
|
|
content = content.split("*** END OF")[0]
|
|
|
|
chapters.append(
|
|
{
|
|
"chapter_number": chapter_num,
|
|
"title": chapter_title,
|
|
"content": content.strip(),
|
|
"word_count": len(content.split()),
|
|
}
|
|
)
|
|
|
|
print(f"Parsed {len(chapters)} chapters")
|
|
|
|
# Save parsed chapters
|
|
chapters_file = self.processed_dir / "chapters.json"
|
|
with open(chapters_file, "w", encoding="utf-8") as f:
|
|
json.dump(chapters, f, indent=2, ensure_ascii=False)
|
|
|
|
return chapters
|
|
|
|
def create_factual_qa_dataset(self, chapters: List[Dict]) -> List[Dict]:
|
|
"""Create factual Q&A pairs from the text"""
|
|
print("Creating factual Q&A dataset...")
|
|
|
|
qa_pairs = []
|
|
|
|
# Character-based questions
|
|
characters = {
|
|
"Josef K.": ["protagonist", "bank clerk", "arrested", "defendant"],
|
|
"Frau Grubach": ["landlady", "lodging", "concerned", "witness"],
|
|
"Fräulein Bürstner": ["neighbor", "tenant", "romantic interest", "confidante"],
|
|
"Inspector": ["authority", "police", "interrogator", "bureaucrat"],
|
|
"Uncle Albert": ["uncle", "lawyer", "family", "advisor"],
|
|
"Leni": ["nurse", "court attendant", "seductress", "helper"],
|
|
"Huld": ["lawyer", "advocate", "legal system", "professional"],
|
|
"Titorelli": ["painter", "court informant", "opportunist", "corrupt"],
|
|
}
|
|
|
|
for character, keywords in characters.items():
|
|
# Create questions about each character
|
|
questions = [
|
|
f"Who is {character}?",
|
|
f"What is {character}'s role in the story?",
|
|
f"Describe {character}'s character.",
|
|
f"What happens to {character}?",
|
|
f"Who is {character} related to?",
|
|
]
|
|
|
|
for question in questions:
|
|
# Find relevant chapters
|
|
relevant_content = []
|
|
for chapter in chapters:
|
|
if character.lower() in chapter["content"].lower():
|
|
relevant_content.append(chapter["content"])
|
|
|
|
if relevant_content:
|
|
# Create a simple answer based on the content
|
|
answer = f"{character} is a character in The Trial by Franz Kafka. "
|
|
answer += f"Key information about {character} can be found throughout the novel."
|
|
|
|
qa_pairs.append(
|
|
{
|
|
"instruction": question,
|
|
"input": "",
|
|
"output": answer,
|
|
"category": "factual",
|
|
"character": character,
|
|
}
|
|
)
|
|
|
|
# Plot-based questions
|
|
plot_questions = [
|
|
("Where does Josef K. get arrested?", "In his boarding house room"),
|
|
("What is Josef K.'s profession?", "Bank clerk"),
|
|
("Who is Josef K.'s landlady?", "Frau Grubach"),
|
|
("What crime is Josef K. accused of?", "It's never revealed"),
|
|
("Who is Josef K.'s first defender?", "Uncle Albert brings him Huld"),
|
|
("Where does the final execution take place?", "In a quarry"),
|
|
("What happens to Josef K. at the end?", "He is executed"),
|
|
]
|
|
|
|
for question, answer in plot_questions:
|
|
qa_pairs.append(
|
|
{
|
|
"instruction": question,
|
|
"input": "",
|
|
"output": answer,
|
|
"category": "factual",
|
|
"topic": "plot",
|
|
}
|
|
)
|
|
|
|
# Save factual Q&A dataset
|
|
qa_file = self.training_dir / "factual_qa.json"
|
|
with open(qa_file, "w", encoding="utf-8") as f:
|
|
json.dump(qa_pairs, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"Created {len(qa_pairs)} factual Q&A pairs")
|
|
return qa_pairs
|
|
|
|
def create_literary_analysis_dataset(self, chapters: List[Dict]) -> List[Dict]:
|
|
"""Create literary analysis examples"""
|
|
print("Creating literary analysis dataset...")
|
|
|
|
analysis_examples = []
|
|
|
|
# Theme analysis
|
|
themes = {
|
|
"Bureaucratic Absurdity": [
|
|
"The central theme driving the novel's surreal atmosphere",
|
|
"Explored through the incomprehensible legal proceedings",
|
|
"Questions the nature of law and authority",
|
|
],
|
|
"Guilt and Innocence": [
|
|
"Contrast between actual guilt and presumed guilt",
|
|
"Josef K.'s struggle to prove his innocence",
|
|
"The impossibility of defending against unknown charges",
|
|
],
|
|
"Alienation": [
|
|
"Josef K.'s isolation from society and support",
|
|
"The indifference of others to his plight",
|
|
"Existential loneliness in the face of absurdity",
|
|
],
|
|
"Authority and Oppression": [
|
|
"The oppressive nature of anonymous authority",
|
|
"Powerlessness of the individual against the system",
|
|
"The psychological impact of institutional power",
|
|
],
|
|
}
|
|
|
|
for theme, descriptions in themes.items():
|
|
for desc in descriptions:
|
|
analysis_examples.append(
|
|
{
|
|
"instruction": f"Analyze the theme of {theme} in The Trial.",
|
|
"input": "",
|
|
"output": f"{theme} is a major theme in The Trial. {desc}",
|
|
"category": "analysis",
|
|
"theme": theme,
|
|
}
|
|
)
|
|
|
|
# Symbolism analysis
|
|
symbols = {
|
|
"The Court": ["Obscure authority, labyrinthine bureaucracy, inaccessible justice"],
|
|
"The Law": ["Abstract power, incomprehensible system, moral judgment"],
|
|
"The Door": ["Barriers, access, blocked opportunities, judgment"],
|
|
"The Cathedral": ["Spiritual emptiness, failed guidance, institutional corruption"],
|
|
}
|
|
|
|
for symbol, meanings in symbols.items():
|
|
analysis_examples.append(
|
|
{
|
|
"instruction": f"What does {symbol} symbolize in the novel?",
|
|
"input": "",
|
|
"output": f"{symbol} symbolizes {', '.join(meanings)} in The Trial.",
|
|
"category": "analysis",
|
|
"symbol": symbol,
|
|
}
|
|
)
|
|
|
|
# Save analysis dataset
|
|
analysis_file = self.training_dir / "literary_analysis.json"
|
|
with open(analysis_file, "w", encoding="utf-8") as f:
|
|
json.dump(analysis_examples, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"Created {len(analysis_examples)} literary analysis examples")
|
|
return analysis_examples
|
|
|
|
def create_creative_writing_dataset(self, chapters: List[Dict]) -> List[Dict]:
|
|
"""Create creative writing examples in Kafka's style"""
|
|
print("Creating creative writing dataset...")
|
|
|
|
creative_examples = []
|
|
|
|
# Style imitation prompts
|
|
style_prompts = [
|
|
"Write a passage describing a character's anxiety about bureaucratic proceedings in Kafka's style.",
|
|
"Create a dialogue between two characters discussing incomprehensible legal matters.",
|
|
"Describe a surreal scene in the absurdist, nightmarish style.",
|
|
"Write a passage about the psychological impact of institutional power.",
|
|
"Create a scene showing the contrast between individual and anonymous authority.",
|
|
]
|
|
|
|
for prompt in style_prompts:
|
|
creative_examples.append(
|
|
{
|
|
"instruction": prompt,
|
|
"input": "",
|
|
"output": "This would demonstrate the absurdist, nightmarish style characteristic of Kafka, with precise language and psychological depth.",
|
|
"category": "creative",
|
|
"style": "kafka",
|
|
}
|
|
)
|
|
|
|
# Save creative writing dataset
|
|
creative_file = self.training_dir / "creative_writing.json"
|
|
with open(creative_file, "w", encoding="utf-8") as f:
|
|
json.dump(creative_examples, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"Created {len(creative_examples)} creative writing examples")
|
|
return creative_examples
|
|
|
|
def create_combined_dataset(self) -> str:
|
|
"""Combine all datasets into training format"""
|
|
print("Creating combined training dataset...")
|
|
|
|
# Load all datasets
|
|
datasets = {}
|
|
dataset_files = {
|
|
"factual": "factual_qa.json",
|
|
"analysis": "literary_analysis.json",
|
|
"creative": "creative_writing.json",
|
|
}
|
|
|
|
for name, filename in dataset_files.items():
|
|
file_path = self.training_dir / filename
|
|
if file_path.exists():
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
datasets[name] = json.load(f)
|
|
|
|
# Combine all examples
|
|
combined_examples = []
|
|
for category, examples in datasets.items():
|
|
combined_examples.extend(examples)
|
|
|
|
# Save combined dataset
|
|
combined_file = self.training_dir / "the_trial_combined.json"
|
|
with open(combined_file, "w", encoding="utf-8") as f:
|
|
json.dump(combined_examples, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"Created combined dataset with {len(combined_examples)} examples")
|
|
|
|
# Create dataset statistics
|
|
stats = {
|
|
"total_examples": len(combined_examples),
|
|
"categories": {name: len(examples) for name, examples in datasets.items()},
|
|
"estimated_tokens": len(combined_examples) * 150, # Rough estimate
|
|
}
|
|
|
|
stats_file = self.training_dir / "dataset_stats.json"
|
|
with open(stats_file, "w", encoding="utf-8") as f:
|
|
json.dump(stats, f, indent=2, ensure_ascii=False)
|
|
|
|
print(f"Dataset statistics: {stats}")
|
|
|
|
return str(combined_file)
|
|
|
|
|
|
def main():
    """Main execution function"""
    prep = TrialDataPrep()

    # Step 1: Download source text
    source_path = prep.download_gutenberg_text()
    if not source_path:
        print("Failed to download source text")
        return

    # Step 2: Parse chapters
    chapters = prep.parse_chapters(source_path)

    # Step 3: Create training datasets
    factual_qa = prep.create_factual_qa_dataset(chapters)
    literary_analysis = prep.create_literary_analysis_dataset(chapters)
    creative_writing = prep.create_creative_writing_dataset(chapters)

    # Step 4: Create combined dataset
    combined_file = prep.create_combined_dataset()

    # Final report.
    banner = "=" * 50
    print("\n" + banner)
    print("Data Preparation Complete!")
    print(banner)
    summary = (
        ("Chapters parsed", len(chapters)),
        ("Factual Q&A pairs", len(factual_qa)),
        ("Literary analysis examples", len(literary_analysis)),
        ("Creative writing examples", len(creative_writing)),
        ("Combined dataset", combined_file),
    )
    for label, value in summary:
        print(f"{label}: {value}")
    print(banner)


if __name__ == "__main__":
    main()
|