ca.nines.ise.dom.DOMStream.java Source code

Introduction

Here is the source code for ca.nines.ise.dom.DOMStream.java
Source

/*
 * Copyright (C) 2014 Michael Joyce <ubermichael@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation version 2.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package ca.nines.ise.dom;

import ca.nines.ise.log.Log;
import ca.nines.ise.log.Message;
import java.io.BufferedReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;

/**
 * DOMStream determines an input stream's encoding and throws an error if it is
 * not UTF-8. It will also correct any typographic quotes to straight quotes,
 * and normalizes the input to NFKC.
 *
 * @author Michael Joyce <ubermichael@gmail.com>
 */
public class DOMStream {

    private final ByteOrderMark bom;
    private final String content;
    private final String encoding;
    private final ArrayList<String> lines;

    /**
     * Construct a DOMStream from an input stream and record the source of the
     * input data.
     *
     * @param in
     * @param source
     * @throws java.io.IOException
     */
    public DOMStream(InputStream in, String source) throws IOException {
        lines = new ArrayList<>();
        boolean warnedSmartQuotes = false;

        BOMInputStream bomStream = new BOMInputStream(in, ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE,
                ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
        bom = bomStream.getBOM();
        if (bom != null) {
            Message m = Message.builder("builder.bom").setSource(source)
                    .addNote("The byte order mark was " + bom.getCharsetName()).build();
            Log.addMessage(m);
            encoding = bom.getCharsetName();
        } else {
            encoding = "UTF-8";
        }

        if (!encoding.equals("UTF-8")) {
            Message m = Message.builder("builder.notutf8").setSource(source)
                    .addNote("The incorrect encoding is " + encoding).build();
            Log.addMessage(m);
        }

        BufferedReader buffer = new BufferedReader(new InputStreamReader(bomStream, encoding));
        String line;
        StringBuilder sb = new StringBuilder();

        Pattern p = Pattern.compile("\u201C|\u201D");

        while ((line = buffer.readLine()) != null) {
            line = Normalizer.normalize(line, Form.NFKC);
            Matcher m = p.matcher(line);
            if (m.find()) {
                line = m.replaceAll("\"");
                if (!warnedSmartQuotes) {
                    warnedSmartQuotes = true;
                    Message msg = Message.builder("builder.smartquotes").setSource(source)
                            .addNote("The first occurence of smart quotes was at line " + lines.size()).build();
                    Log.addMessage(msg);
                }
            }
            lines.add(line);
            sb.append(line).append("\n");
        }

        content = sb.toString().trim();
    }

    /**
     * Constructs a DOMString from a string. The resulting DOM source will be
     * "#STRING".
     *
     * @param input The string to parse.
     *
     * @throws java.io.IOException
     */
    public DOMStream(String input) throws IOException {
        this(IOUtils.toInputStream(input, "UTF-8"), "#STRING");
    }

    /**
     * Constructs a DOMStream from a File. The resulting DOM source will
     * return the absolute path to the file.
     *
     * @param input The file to read and parse.
     *
     * @throws FileNotFoundException if the file cannot be found.
     * @throws IOException if the file cannot be read.
     */
    public DOMStream(File input) throws FileNotFoundException, IOException {
        this(new FileInputStream(input), input.getCanonicalPath());
    }

    /**
     * Return the byte order mark, if there is one.
     *
     * @return ByteOrderMark
     */
    public ByteOrderMark getBOM() {
        return bom;
    }

    /**
     * Return the processed content of the file.
     *
     * @return String
     */
    public String getContent() {
        return content;
    }

    /**
     * Return the file's encoding.
     *
     * @return String
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Return the list of lines found in the input.
     *
     * @return String[]
     */
    public String[] getLines() {
        return lines.toArray(new String[lines.size()]);
    }
}