me.cavar.pg2tei.Gutenberg2TEI.java Source code

Java tutorial

Introduction

Here is the source code for me.cavar.pg2tei.Gutenberg2TEI.java

Source

/*
 * (C) 2012 by Damir Cavar
 *
 * Download the Project Gutenberg catalog.rdf file, fetch the RDF for each
 * individual book, generate TEI XML meta-header, fetch the HTML of the book,
 * convert to TEI XML.
 *
 * 
 * License:
 * ========
 * 
 * Copyright 2012 Damir Cavar (http://cavar.me/damir/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

package me.cavar.pg2tei;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

/**
 * Command-line entry point: makes sure the Project Gutenberg catalog file is
 * available in the output folder (downloading and unzipping it when needed)
 * and then hands it to an {@code RDFParser}.
 *
 * @author Damir Cavar
 */
public class Gutenberg2TEI {

    /** Logger for this class; failures are reported under this class's own name. */
    private static final Logger LOGGER = Logger.getLogger(Gutenberg2TEI.class.getName());

    /** Base URL handed to the RDF parser; the individual RDF files are at this URL. */
    private static final String DEFAULT_EBOOK_URL = "http://www.gutenberg.org/ebooks/";

    /** Default URL of the zipped catalog. */
    private static final String DEFAULT_CATALOG_URL = "http://www.gutenberg.org/feeds/catalog.rdf.zip";

    /** Default output folder (the current working directory). */
    private static final String DEFAULT_OUTPUT_FOLDER = ".";

    /** Name of the catalog file inside the output folder. */
    private static final String CATALOG_FILE_NAME = "catalog.rdf";

    /**
     * Parses the catalog file {@code outputFolder/catalogOutFN} with an
     * {@code RDFParser} and prints the number of entries the parser reports.
     * Parser set-up failures are logged and do not propagate to the caller.
     *
     * @param outputFolder folder that contains the catalog file; also passed to the parser
     * @param catalogOutFN name of the catalog file inside {@code outputFolder}
     * @param ebookURLStr  base URL passed to the parser (presumably used to
     *                     locate the per-book RDF files; see {@code RDFParser})
     */
    public static void processRDF(String outputFolder, String catalogOutFN, String ebookURLStr) {
        System.out.println("Processing catalog.rdf");
        final File rdfFile = new File(outputFolder, catalogOutFN);
        final String rdfPath = rdfFile.getAbsolutePath();
        final RDFParser rdfParser = new RDFParser(ebookURLStr, outputFolder);
        try {
            System.out.println(rdfPath);
            rdfParser.parseDocument(rdfPath);
        } catch (ParserConfigurationException | SAXNotRecognizedException | SAXNotSupportedException e) {
            // Report under this class's logger (not Fetcher's) and say which file was involved.
            LOGGER.log(Level.SEVERE, "Could not parse catalog file " + rdfPath, e);
        }
        System.out.printf("Number of entries: %d\n", rdfParser.getEntryCounter());
    }

    /**
     * Makes the catalog file available in the output folder. Nothing is done
     * when {@code outputFolder/catalogOutFN} already exists. Otherwise the zip
     * archive is downloaded (unless a file with the archive's name is already
     * present in the output folder) and then unzipped via {@code Fetcher}.
     * I/O failures, including a malformed URL, are logged with their cause and
     * do not propagate to the caller.
     *
     * @param catalogURLStr URL of the zipped catalog
     * @param outputFolder  folder in which the zip archive and the catalog file are kept
     * @param catalogOutFN  name of the unzipped catalog file inside {@code outputFolder}
     */
    public static void fetchRDF(String catalogURLStr, String outputFolder, String catalogOutFN) {
        final Fetcher fetcher = new Fetcher(catalogURLStr, outputFolder, catalogOutFN);
        try {
            final URL catalogURL = new URL(catalogURLStr);
            // The local archive gets the last path segment of the URL as its name.
            final String zipName = new File(catalogURL.getFile()).getName();
            final File zipFile = new File(outputFolder, zipName);
            final File rdfFile = new File(outputFolder, catalogOutFN);

            if (rdfFile.exists()) {
                // Catalog already unpacked: nothing to fetch.
                return;
            }
            if (!zipFile.exists()) {
                // get the catalog.rdf.zip file
                System.out.print("Fetching Gutenberg Catalog-file as RDF-Zip... writing to ");
                System.out.println(zipFile);
                fetcher.getCatalog(catalogURL, zipFile);
                System.out.println("Done");
            }
            // unzip the catalog.rdf.zip file to catalog.rdf
            System.out.print("Unzipping the Catalog-file... ");
            fetcher.unZip(zipFile);
            System.out.println("Done");
        } catch (IOException e) {
            // Keep the cause and the URL instead of printing a bare "IOError".
            LOGGER.log(Level.SEVERE, "Could not fetch or unzip the catalog from " + catalogURLStr, e);
        }
    }

    /**
     * Prints the program description and the supported command-line options.
     */
    private static void printUsage() {
        System.out.println("Project Gutenberg fetch RDF catalog, HTML-files and generate TEI XML");
        System.out.println("");
        System.out.println("Options:");
        System.out.println("  -c <url>     Catalogue URL (default: " + DEFAULT_CATALOG_URL + ")");
        System.out.println("  -o <folder>  Output folder (default: " + DEFAULT_OUTPUT_FOLDER + ")");
        System.out.println("  -h           Help");
    }

    /**
     * Parses the command line, then fetches and processes the catalog.
     * Supported options: {@code -c} catalogue URL, {@code -o} output folder,
     * {@code -h} help. With {@code -h}, or when the arguments cannot be
     * parsed, nothing is fetched or processed.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        // Process command line
        final Options options = new Options();
        options.addOption("c", true, "Catalogue URL");
        options.addOption("o", true, "Output folder");
        options.addOption("h", false, "Help");
        // The name of the resulting catalog file is fixed; there is no option to override it.

        // NOTE(review): PosixParser is deprecated in newer Commons CLI releases
        // (DefaultParser replaces it); kept because it is what this file imports.
        final CommandLineParser parser = new PosixParser();
        final CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException ex) {
            // Do not silently continue with defaults after a mistyped invocation.
            System.err.println("Command line argument error:" + ex.getMessage());
            System.err.println("Use -h for help.");
            return;
        }

        if (cmd.hasOption("h")) {
            printUsage();
            return;
        }

        final String catalogURLStr = cmd.getOptionValue("c", DEFAULT_CATALOG_URL);
        final String outputFolder = cmd.getOptionValue("o", DEFAULT_OUTPUT_FOLDER);

        // Do the fetching of the RDF catalog
        fetchRDF(catalogURLStr, outputFolder, CATALOG_FILE_NAME);

        // process the RDF file
        processRDF(outputFolder, CATALOG_FILE_NAME, DEFAULT_EBOOK_URL);
    }
}