Automated Popular English Fiction Genre Classification System for 19th and early 20th Century Australian Newspaper Fiction Stories

Click here for the relevant post that explains what this is: Automated Popular English Fiction Genre Classification System

import openai
import os
import csv
import time

# Set up API key
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside this script; the placeholder is only used when it is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'your_chatgpt_api_key')

# Directory containing text files and output CSV path
input_folder = r"C:\your_directory_address"
output_csv = r"C:\where_you_want_the_csv\gptapi_genre_analysis_results.csv"

# Token and rate limits
# NOTE: analyze_text applies MAX_TOKENS_PER_FILE as a cap on whitespace-separated
# words, not on model tokens.
MAX_TOKENS_PER_FILE = 2000
BASE_RATE_LIMIT_DELAY = 4.0  # Base delay in seconds (passed to time.sleep) for rate limiting
MAX_RETRIES = 5  # Max retry attempts per file for rate limits

# Create (or overwrite) the results CSV and emit its single header row.
# Each analysed story later appends one row with these eight columns.
header_columns = [
    'File Name',
    'Science Fiction Rating',
    'SF Level (None/Mild/Moderate/High)',
    'Plot Description',
    'OCR Quality',
    'Genre 1',
    'Genre 2',
    'Genre 3',
]
with open(output_csv, 'w', newline='', encoding='utf-8') as header_file:
    csv.writer(header_file).writerow(header_columns)

# Helper: cap a story at a maximum number of whitespace-separated words
def _truncate_words(content, max_words):
    """Return *content* limited to its first *max_words* words.

    Text at or under the limit is returned untouched; longer text is rebuilt
    from its first *max_words* words joined by single spaces.
    """
    words = content.split()  # split once and reuse for both the check and the cut
    if len(words) <= max_words:
        return content
    return " ".join(words[:max_words])


# Helper: pull the labelled fields out of the model's reply text
def _parse_analysis(result_text):
    """Parse the reply into (fields, genres).

    *fields* maps 'rating', 'level', 'plot' and 'ocr' to the text following
    the matching label (or None when the label never appears).  *genres* is
    always a list of exactly three entries, padded with None.
    """
    labels = (
        ("Science Fiction Rating:", "rating"),
        ("SF Level:", "level"),
        ("Plot Description:", "plot"),
        ("OCR Quality:", "ocr"),
    )
    fields = {key: None for _, key in labels}
    genres = [None, None, None]

    for raw_line in result_text.splitlines():
        # Drop markdown emphasis so "**SF Level:** mild" and "**SF Level**: mild"
        # are both recognised and the stored value is free of stray asterisks.
        line = raw_line.replace("*", "")
        for label, key in labels:
            if label in line:
                fields[key] = line.split(":", 1)[1].strip()
                break
        else:
            if "Genres:" in line:
                # Genres are split into separate columns; blank entries
                # (e.g. from a trailing comma) are skipped so they do not
                # push a real genre out of the three available slots.
                named = [g.strip() for g in line.split(":", 1)[1].split(",")]
                named = [g for g in named if g]
                genres = (named + [None, None, None])[:3]

    return fields, genres


# Function to analyze text using ChatCompletion with retry logic for server errors and rate limits
def analyze_text(file_name, content):
    """Ask the chat model to rate and classify one story.

    Args:
        file_name: Name or path of the story file; only its base name is used,
            for log messages and for the first column of the result.
        content: Full text of the story.  Only the first MAX_TOKENS_PER_FILE
            whitespace-separated words are sent to the API.

    Returns:
        A list ``[base name, SF rating, SF level, plot description,
        OCR quality, genre 1, genre 2, genre 3]`` (missing genres are None),
        or None when the reply lacks any of the four non-genre fields, when
        retries are exhausted, or when any other error occurs.

    Rate-limit errors are retried up to MAX_RETRIES times with a delay that
    starts at BASE_RATE_LIMIT_DELAY seconds and doubles each time.  API errors
    whose text contains "server_error" are retried up to 3 times, 60 seconds
    apart.  All failures are reported with print() rather than raised.
    """
    base_name = os.path.basename(file_name)
    content = _truncate_words(content, MAX_TOKENS_PER_FILE)

    # Define messages for Chat API with clear labels
    messages = [
        {"role": "system", "content": "You are an expert in early 20th-century science fiction and popular fiction literature."},
        {"role": "user", "content": (
            f"Analyze the following text for science fiction elements and suggest genres based on 19th and early 20th-century fiction categories:\n\n{content}\n\n"
            "Output the following information, with no extra commentary:\n"
            "1. Science Fiction Rating: Rate from 0 to 10, based only on scientifically plausible or speculative elements grounded in science.\n"
            "2. SF Level: Briefly state the level as 'none,' 'mild,' 'moderate,' or 'high,' indicating the degree of science fiction elements.\n"
            "3. Plot Description: Describe the general plot in one to two sentences (30 words or fewer), focusing on genre-relevant elements only.\n"
            "4. OCR Quality: Rate from A (clearly readable) to E (unreadable).\n"
            "5. Genres: Based on the plot description and themes, identify up to three genres from the following list:\n"
            "   - Gothic, Sensation, Social Problems, Didactic, Adventure, Utopian, Dystopian, Colonial, Imperialist, Domestic, Spiritualist, War, New Woman, Allegorical, Historical, Horror, Supernatural, Crime, Detective, Invention, Planetary, Lost Race, Romance, Western, Travel, Prediction\n"
            "If the story combines elements of multiple genres, provide them in separate fields, such as 'historical war' or 'prediction romance'."
        )}
    ]

    delay = BASE_RATE_LIMIT_DELAY
    retries = 0
    server_error_retries = 3  # Limit server error retries to 3 attempts

    while retries < MAX_RETRIES:
        try:
            # Make API request
            # NOTE(review): max_tokens=100 leaves little headroom for five
            # labelled fields plus a 30-word plot summary; if the Genres line
            # is often missing, consider raising it -- confirm against real replies.
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=messages,
                max_tokens=100,
                temperature=0.2
            )

            # Parse API response
            result_text = response.choices[0].message['content'].strip()
            fields, genres = _parse_analysis(result_text)

            # Genres are optional; the other four fields must all be present
            if all(fields.values()):
                return [base_name, fields["rating"], fields["level"], fields["plot"], fields["ocr"]] + genres

            print(f"Incomplete response for file {base_name}.")
            return None

        except openai.error.RateLimitError:
            print(f"Rate limit reached for file {base_name}. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff for rate limits
            retries += 1

        except openai.error.APIError as e:
            # Handle server errors with a separate retry mechanism; this budget
            # is independent of `retries`, so it does not consume rate-limit attempts
            if "server_error" in str(e) and server_error_retries > 0:
                print(f"Server error encountered for file {base_name}. Retrying in 60 seconds...")
                time.sleep(60)
                server_error_retries -= 1
            else:
                print(f"Server error for file {base_name} after retries. Moving to next file.")
                return None

        except Exception as e:
            # Best-effort batch job: report anything unexpected and skip this file
            print(f"Error processing file {base_name}: {e}")
            return None

    print(f"Max retries reached for file {base_name}. Skipping.")
    return None

# Process each file in the folder (sorted so the CSV row order is reproducible)
for file_name in sorted(os.listdir(input_folder)):
    file_path = os.path.join(input_folder, file_name)

    # Case-insensitive extension check so e.g. "STORY.TXT" is not silently skipped
    if not (os.path.isfile(file_path) and file_name.lower().endswith('.txt')):
        continue

    # Read the whole story up front so the file handle is closed before the
    # slow API call and the rate-limit sleep.
    try:
        with open(file_path, 'r', encoding='utf-8') as text_file:
            content = text_file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file {os.path.basename(file_path)}: {e}")
        continue

    if not content.strip():
        print(f"File {os.path.basename(file_path)} is empty. Skipping.")
        continue

    # Analyze text and write result to CSV if valid.  The CSV is reopened in
    # append mode for every row, so finished rows are already on disk if a
    # later file fails.
    try:
        result = analyze_text(file_name, content)
        if result:
            with open(output_csv, mode='a', newline='', encoding='utf-8') as csv_file:
                csv.writer(csv_file).writerow(result)
    except Exception as e:
        # Keep the batch going (e.g. the CSV is locked by another program),
        # but report the failure as what it is rather than as a read error.
        print(f"Error processing file {os.path.basename(file_path)}: {e}")

    # Delay to respect base rate limit before next file
    time.sleep(BASE_RATE_LIMIT_DELAY)

print("Batch processing complete.")

Example of output:

File Name | Science Fiction Rating | SF Level (None/Mild/Moderate/High) | Plot Description | OCR Quality | Genre 1 | Genre 2 | Genre 3
1901 A Dead Finger.txt | 2 | mild | A man encounters a mysterious, disembodied finger with a life of its own, causing him distress and confusion. | C | Horror | Supernatural | Sensation
1901 Something to His Advantage.txt | 0 | none | A mysterious man confronts another about a murder, leading to revelations about identity and past actions. | B | Crime | Detective | Sensation
1901 The Prodigal of Glencourt A Romance of Maoriland.txt | 0 | none | A man named Sydney Black, recovering from illness, navigates a criminal underworld in New Zealand with Maori allies and outlaws. | B | Colonial | Adventure | Crime
1901 The Cankerworm.txt | 0 | none | A young woman, Linda, faces personal turmoil and societal pressures as she seeks her missing husband, with the help of a devoted friend. | C | Domestic | Social Problems | Romance
1901 Old House in Cripplegate.txt | 0 | none | A young man faces familial pressure to abandon his love interest due to his uncle’s unresolved past grievances. | C | Domestic | Historical | Romance
1901 A Lesson in Love A Complete Story.txt | 0 | none | A man contemplates marriage as a career necessity while navigating social expectations and romantic interests. | C | Domestic | Romance | Social Problems
1901 An Old Mans Darling.txt | 0 | none | A young girl encounters a fortune-teller who predicts a future filled with love, hate, and sorrow, leading to introspection. | C | Romance | Supernatural | Gothic
1901 Salomy Janes Kiss.txt | 0 | none | A vigilante group captures two horse thieves, but one escapes after a surprising kiss from a local girl. | C | Western | Adventure | Crime
1901 The Hearts Mistake.txt | 0 | none | Sylvia becomes a companion to a widow, Mrs. Seymour, as they travel and navigate social expectations and personal ambitions. | B | Domestic | Social Problems | Travel
1901 Lady Margots Leap.txt | 0 | none | A young woman, Lilian, under the watchful eye of her aunt, secretly meets a charming man with a mysterious past. | B | Gothic | Domestic | Romance
1901 The Landlord of the Big Flume Hotel.txt | 0 | none | A divorced couple reunites at a hotel, discussing their past and potential future relationships. | C | Domestic | Western |
1901 Nikolas Farewell.txt | 2 | mild | A group of friends in Venice encounter the mysterious Nikola, whose presence evokes fear and curiosity due to his past experiments and enigmatic nature. | B | Adventure | Supernatural | Gothic

If you wish to use this code in your research, you’re most welcome. Please leave a comment. If you use it in your paper, please cite it. MLA example here:

Hogan, Neil. “Automated Popular English Fiction Genre Classification System for 19th and Early 20th Century Australian Newspaper Fiction Stories.” Retrieving “Science in Fiction” from Early 20th Century Australian Newspapers, Neil Hogan, 2 Nov. 2024, neilhogan.com/automated-popular-english-fiction-genre-classification-system-for-19th-and-early-20th-century-australian-newspaper-fiction-stories/

Leave a Reply

Your email address will not be published. Required fields are marked *

Verified by MonsterInsights