com.edu.duke.URLResource.java Source code

Introduction

Here is the source code for com.edu.duke.URLResource.java
Source

package com.edu.duke;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;

import java.io.*;
import java.net.URL;

/**
 * The <code>URLResource</code> class opens a connection to a URL and allows access to the contents
 * of the web page a line at a time, using the method <code>lines</code>, or a word at a time, using
 * the method <code>words</code>. These strings can then be iterated over using a <code>for</code>
 * loop.
 * 
 * <P>
 * Example usage:
 * 
 * <PRE>
 * URLResource ur = new URLResource("http://www.dukelearntoprogram.com/");
 * for (String s : ur.lines()) {
 *     // print or process s
 * }
 * </PRE>
 *
 * <P>
 * If each line of the web page represents separated data values, because its a CSV file, then the
 * user can get a <code>getCSVParser</code> object to access that data more directly, using one of the
 * <code>getCSVParser</code> methods.
 * 
 * <P>
 * Example CSV usage:
 * 
 * <PRE>
 * URLResource ur = new URLResource("http://www.dukelearntoprogram.com/course2/java/food.csv");
 * for (CSVRecord record : ur.getCSVParser()) {
 *     // print or process fields in record
 *     String name = record.get("Name");
 *     // other processing
 * }
 * </PRE>
 * 
 * <P>
 * This software is licensed with an Apache 2 license, see
 * http://www.apache.org/licenses/LICENSE-2.0 for details.
 * 
 * @author Duke Software Team
 */
public class URLResource {
    private String myPath;
    private String mySource;

    /**
     * Create a <code>URLResource</code> object bound to the web page whose URL is given as the
     * parameter.
     * 
     * Constructing the object opens a connection and reads the contents of the web page.
     * 
     * @param name is the name of the URL, it must start with "http" or "https"
     */
    public URLResource(String name) {
        if (name.startsWith("http://") || name.startsWith("https://")) {
            try {
                mySource = initFromStream(new URL(name).openStream());
                myPath = name;
            } catch (Exception e) {
                throw new ResourceException("URLResource: unable to load URL with name " + name, e);
            }
        } else {
            throw new ResourceException("URLResource: name must start with http:// or https://" + name);
        }
    }

    /**
     * Allow access to open web page one line at a time.
     * 
     * @return an <code>Iterable</code> that allows access one line at a time
     */
    public Iterable<String> lines() {
        return new TextIterable(mySource, "\\n");
    }

    /**
     * Allow access to this open web page one word at a time, where words are separated by
     * white-space. This means any form of spaces, like tabs or newlines, can delimit words.
     * 
     * @return an <code>Iterable</code> that allows access one word at a time
     */
    public Iterable<String> words() {
        return new TextIterable(mySource, "\\s+");
    }

    /**
     * Return entire open web page as one string.
     * 
     * @return a <code>String</code> that is the contents of the open web page
     */
    public String asString() {
        return mySource;
    }

    /**
     * Returns a <code>CSVParser</code> object to access the contents of an open web page.
     * 
     * Each line of the web page should be formatted as data separated by commas and with a header
     * row to describe the column names.
     * 
     * @return a <code>CSVParser</code> that can provide access to the records in the web page one
     *         at a time
     */
    public CSVParser getCSVParser() {
        return getCSVParser(true);
    }

    /**
     * Returns a <code>CSVParser</code> object to access the contents of an open web page, possibly
     * without a header row.
     * 
     * Each line of the web page should be formatted as data separated by commas and with/without a
     * header row to describe the column names.
     * 
     * @param withHeader uses first row of data as a header row only if true
     * @return a <code>CSVParser</code> that can provide access to the records in the web page one
     *         at a time
     */
    public CSVParser getCSVParser(boolean withHeader) {
        return getCSVParser(withHeader, ",");
    }

    /**
     * Returns a <code>CSVParser</code> object to access the contents of an open web page, possibly
     * without a header row and a different data delimiter than a comma.
     * 
     * Each line of the web page should be formatted as data separated by the delimiter passed as a
     * parameter and with/without a header row to describe the column names. This is useful if the
     * data is separated by some character other than a comma.
     * 
     * @param withHeader uses first row of data as a header row only if true
     * @param delimiter a single character that separates one field of data from another
     * @return a <code>CSVParser</code> that can provide access to the records in the web page one
     *         at a time
     */
    public CSVParser getCSVParser(boolean withHeader, String delimiter) {
        if (delimiter == null || delimiter.length() != 1) {
            throw new ResourceException("URLResource: CSV delimiter must be a single character: " + delimiter);
        }
        try {
            char delim = delimiter.charAt(0);
            Reader input = new StringReader(mySource);
            if (withHeader) {
                return new CSVParser(input, CSVFormat.EXCEL.withHeader().withDelimiter(delim));
            } else {
                return new CSVParser(input, CSVFormat.EXCEL.withDelimiter(delim));
            }
        } catch (Exception e) {
            throw new ResourceException("URLResource: cannot read " + myPath + " as a CSV file.");
        }
    }

    /**
     * Allows access to the column names of the header row of a CSV file (the first line in the
     * file) one at a time. If the CSV file did not have a header row, then an empty
     * <code>Iterator</code> is returned.
     * 
     * @param parser the <code>CSVParser</code> that has been created for this web page
     * @return an <code>Iterable</code> that allows access one header name at a time
     */
    public Iterable<String> getCSVHeaders(CSVParser parser) {
        return parser.getHeaderMap().keySet();
    }

    // store data (keep in sync with URLResource)
    private String initFromStream(InputStream stream) {
        try (BufferedReader buff = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
            StringBuilder contents = new StringBuilder();
            String line;
            while ((line = buff.readLine()) != null) {
                contents.append(line).append("\n");
            }
            return contents.toString();
        } catch (Exception e) {
            throw new ResourceException("URLResource: error encountered reading " + myPath, e);
        }
    }
}