spark.help.CustomNTriplesParser.java Source code

Introduction

Here is the source code for spark.help.CustomNTriplesParser.java
Source

/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *******************************************************************************/
package spark.help;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.HashSet;

import org.apache.commons.io.input.BOMInputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.eclipse.rdf4j.rio.helpers.NTriplesParserSettings;
import org.eclipse.rdf4j.rio.ntriples.NTriplesUtil;

import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import scala.Tuple2;

public class CustomNTriplesParser extends AbstractRDFParser {

    protected Reader reader;

    protected long lineNo;

    ObjectBigArrayBigList<Tuple2<String, Tuple2<String, String>>> result = new ObjectBigArrayBigList<Tuple2<String, Tuple2<String, String>>>(
            50000);

    String subject;
    String predicate;
    String object;
    StringBuilder datatypeUri = new StringBuilder();
    StringBuilder languageTag = new StringBuilder();
    StringBuilder buffer = new StringBuilder();

    public ObjectBigArrayBigList<Tuple2<String, Tuple2<String, String>>> getResult() {
        return result;
    }

    /**
     * Implementation of the <tt>parse(InputStream, String)</tt> method defined in the RDFParser interface.
     * 
     * @param in
     *        The InputStream from which to read the data, must not be <tt>null</tt>. The InputStream is
     *        supposed to contain 7-bit US-ASCII characters, as per the N-Triples specification.
     * @param baseURI
     *        The URI associated with the data in the InputStream, must not be <tt>null</tt>.
     * @throws IOException
     *         If an I/O error occurred while data was read from the InputStream.
     * @throws RDFParseException
     *         If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *         If the configured statement handler encountered an unrecoverable error.
     * @throws IllegalArgumentException
     *         If the supplied input stream or base URI is <tt>null</tt>.
     */
    @Override
    public synchronized void parse(InputStream in, String baseURI)
            throws IOException, RDFParseException, RDFHandlerException {
        if (in == null) {
            throw new IllegalArgumentException("Input stream can not be 'null'");
        }
        // Note: baseURI will be checked in parse(Reader, String)

        try {
            parse(new InputStreamReader(new BOMInputStream(in, false), Charset.forName("UTF-8")), baseURI);
        } catch (UnsupportedEncodingException e) {
            // Every platform should support the UTF-8 encoding...
            throw new RuntimeException(e);
        }
    }

    /**
     * Implementation of the <tt>parse(Reader, String)</tt> method defined in the RDFParser interface.
     * 
     * @param reader
     *        The Reader from which to read the data, must not be <tt>null</tt>.
     * @param baseURI
     *        The URI associated with the data in the Reader, must not be <tt>null</tt>.
     * @throws IOException
     *         If an I/O error occurred while data was read from the InputStream.
     * @throws RDFParseException
     *         If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *         If the configured statement handler encountered an unrecoverable error.
     * @throws IllegalArgumentException
     *         If the supplied reader or base URI is <tt>null</tt>.
     */
    @Override
    public synchronized void parse(Reader reader, String baseURI)
            throws IOException, RDFParseException, RDFHandlerException {
        if (reader == null) {
            throw new IllegalArgumentException("Reader can not be 'null'");
        }
        if (baseURI == null) {
            throw new IllegalArgumentException("base URI can not be 'null'");
        }

        if (rdfHandler != null) {
            rdfHandler.startRDF();
        }
        result.clear();

        this.reader = reader;
        lineNo = 1;

        reportLocation(lineNo, 1);

        try {
            int c = readCodePoint();
            c = skipWhitespace(c);

            while (c != -1) {
                if (c == '#') {
                    // Comment, ignore
                    c = skipLine(c);
                } else if (c == '\r' || c == '\n') {
                    // Empty line, ignore
                    c = skipLine(c);
                } else {
                    c = parseTriple(c);
                }

                c = skipWhitespace(c);
            }
        } finally {
            clear();
        }

        if (rdfHandler != null) {
            rdfHandler.endRDF();
        }
    }

    /**
     * Reads characters from reader until it finds a character that is not a space or tab, and returns this
     * last character code point. In case the end of the character stream has been reached, -1 is returned.
     */
    protected int skipWhitespace(int c) throws IOException {
        while (c == ' ' || c == '\t') {
            c = readCodePoint();
        }

        return c;
    }

    /**
     * Verifies that there is only whitespace or comments until the end of the line.
     */
    protected int assertLineTerminates(int c) throws IOException, RDFParseException {
        c = readCodePoint();

        c = skipWhitespace(c);

        if (c == '#') {
            // c = skipToEndOfLine(c);
        } else {
            if (c != -1 && c != '\r' && c != '\n') {
                reportFatalError("Content after '.' is not allowed");
            }
        }

        return c;
    }

    /**
     * Reads characters from reader until the first EOL has been read. The EOL character or -1 is returned.
     */
    protected int skipToEndOfLine(int c) throws IOException {
        while (c != -1 && c != '\r' && c != '\n') {
            c = readCodePoint();
        }

        return c;
    }

    /**
     * Reads characters from reader until the first EOL has been read. The first character after the EOL is
     * returned. In case the end of the character stream has been reached, -1 is returned.
     */
    protected int skipLine(int c) throws IOException {
        while (c != -1 && c != '\r' && c != '\n') {
            c = readCodePoint();
        }

        // c is equal to -1, \r or \n. In case of a \r, we should
        // check whether it is followed by a \n.

        if (c == '\n') {
            c = readCodePoint();

            lineNo++;

            reportLocation(lineNo, 1);
        } else if (c == '\r') {
            c = readCodePoint();

            if (c == '\n') {
                c = readCodePoint();
            }

            lineNo++;

            reportLocation(lineNo, 1);
        }

        return c;
    }

    private int parseTriple(int c) throws IOException, RDFParseException, RDFHandlerException {
        boolean ignoredAnError = false;
        try {
            c = parseSubject(c, buffer);
            subject = buffer.toString();
            clearBuffer(buffer);

            c = skipWhitespace(c);

            c = parsePredicate(c, buffer);
            predicate = buffer.toString();
            clearBuffer(buffer);

            c = skipWhitespace(c);

            c = parseObject(c, buffer);
            object = buffer.toString();
            clearBuffer(buffer);
            clearBuffer(datatypeUri);
            clearBuffer(languageTag);

            c = skipWhitespace(c);

            if (c == -1) {
                throwEOFException();
            } else if (c != '.') {
                reportError("Expected '.', found: " + new String(Character.toChars(c)),
                        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
            }

            c = assertLineTerminates(c);
        } catch (RDFParseException rdfpe) {
            if (getParserConfig().isNonFatalError(NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES)) {
                reportError(rdfpe, NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
                ignoredAnError = true;
            } else {
                throw rdfpe;
            }
        }

        c = skipLine(c);

        if (!ignoredAnError) {

            Tuple2<String, String> po = new Tuple2<String, String>(predicate, object);
            result.add(new Tuple2<String, Tuple2<String, String>>(subject, po));
            //result.add(new Tuple2<String,String>(subject,predicate+DataFormatter.SEPERATOR+object));
            /*Statement st = createStatement(subject, predicate, object);
            if (rdfHandler != null) {
               rdfHandler.handleStatement(st);
            }*/
        }

        subject = null;
        predicate = null;
        object = null;

        return c;
    }

    private void clearBuffer(StringBuilder b) {
        // TODO Auto-generated method stub
        b.delete(0, b.length());
        b.setLength(0);
    }

    protected int parseSubject(int c, StringBuilder sb) throws IOException, RDFParseException {
        //StringBuilder sb = new StringBuilder(100);

        // subject is either an uriref (<foo://bar>) or a nodeID (_:node1)
        if (c == '<') {
            // subject is an uriref

            c = parseUriRef(c, sb);
            //subject = sb.toString();
            //subject = createURI(sb.toString());
        } else if (c == '_') {
            // subject is a bNode

            c = parseNodeID(c, sb);
            //subject = sb.toString();
            //subject = createBNode(sb.toString());
        } else if (c == -1) {
            throwEOFException();
        } else {
            reportFatalError("Expected '<' or '_', found: " + new String(Character.toChars(c)));
        }
        //System.out.println("subject = "+subject);
        return c;
    }

    protected int parsePredicate(int c, StringBuilder sb) throws IOException, RDFParseException {
        //StringBuilder sb = new StringBuilder(100);

        // predicate must be an uriref (<foo://bar>)
        if (c == '<') {
            // predicate is an uriref
            //predicate = "";
            c = parseUriRef(c, sb);
            //predicate = sb.toString();
            //predicate = createURI(sb.toString());
        } else if (c == -1) {
            throwEOFException();
        } else {
            reportFatalError("Expected '<', found: " + new String(Character.toChars(c)));
        }

        return c;
    }

    protected int parseObject(int c, StringBuilder sb) throws IOException, RDFParseException {
        //StringBuilder sb = getBuffer();

        // object is either an uriref (<foo://bar>), a nodeID (_:node1) or a
        // literal ("foo"-en or "1"^^<xsd:integer>).
        if (c == '<') {
            // object is an uriref
            //object = "";
            c = parseUriRef(c, sb);
            //object = sb.toString();
            //object = createURI(sb.toString());
        } else if (c == '_') {
            // object is a bNode
            //object = "";
            c = parseNodeID(c, sb);
            //object = sb.toString();
            //object = createBNode(sb.toString());
        } else if (c == '"') {
            // object is a literal
            //object = "";
            //String lang = getLanguageTagBuffer();
            //String datatype = getDatatypeUriBuffer();
            c = parseLiteral(c, sb, this.languageTag, this.datatypeUri);

            //String o = "";
            //StringBuilder sb = new StringBuilder(label.length() * 2);

            sb.insert(0, '"');
            sb.append('"');
            //object = '"' + object + '"';

            if (this.languageTag.length() != 0) {
                sb.append('@');
                sb.append(this.languageTag.toString());
            } else if (this.datatypeUri.length() != 0) {
                sb.append("^^<");
                sb.append(this.datatypeUri.toString());
                sb.append(">");
            }

            //createLiteral(sb.toString(), lang.toString(), datatype.toString());
        } else if (c == -1) {
            throwEOFException();
        } else {
            reportFatalError("Expected '<', '_' or '\"', found: " + new String(Character.toChars(c)) + "");
        }

        return c;
    }

    protected int parseUriRef(int c, StringBuilder sb) throws IOException, RDFParseException {
        if (c != '<') {
            reportError("Supplied char should be a '<', is: " + new String(Character.toChars(c)),
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }
        // Read up to the next '>' character
        c = readCodePoint();
        while (c != '>') {
            if (c == -1) {
                throwEOFException();
            }
            if (c == ' ') {
                reportError("IRI included an unencoded space: " + new String(Character.toChars(c)),
                        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
            }
            sb.append(Character.toChars(c));

            if (c == '\\') {
                // This escapes the next character, which might be a '>'
                c = readCodePoint();
                if (c == -1) {
                    throwEOFException();
                }
                if (c != 'u' && c != 'U') {
                    reportError("IRI includes string escapes: '\\" + c + "'",
                            NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
                }
                sb.append(Character.toChars(c));
            }

            c = readCodePoint();
        }

        // c == '>', read next char
        c = readCodePoint();

        return c;
    }

    protected int parseNodeID(int c, StringBuilder sb) throws IOException, RDFParseException {
        if (c != '_') {
            reportError("Supplied char should be a '_', is: " + new String(Character.toChars(c)),
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }

        c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        } else if (c != ':') {
            reportError("Expected ':', found: " + new String(Character.toChars(c)),
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }

        c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        } else if (!NTriplesUtil.isLetterOrNumber(c)) {
            reportError("Expected a letter or number, found: " + new String(Character.toChars(c)),
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }
        sb.append(Character.toChars(c));

        // Read all following letter and numbers, they are part of the name
        c = readCodePoint();
        while (c != -1 && NTriplesUtil.isLetterOrNumber(c)) {
            sb.append(Character.toChars(c));
            c = readCodePoint();
        }

        return c;
    }

    private int parseLiteral(int c, StringBuilder sb, StringBuilder lang, StringBuilder datatype)
            throws IOException, RDFParseException {
        if (c != '"') {
            reportError("Supplied char should be a '\"', is: " + c,
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }

        // Read up to the next '"' character
        c = readCodePoint();
        while (c != '"') {
            if (c == -1) {
                throwEOFException();
            }
            sb.append(Character.toChars(c));

            if (c == '\\') {
                // This escapes the next character, which might be a double quote
                c = readCodePoint();
                if (c == -1) {
                    throwEOFException();
                }
                sb.append(Character.toChars(c));
            }

            c = readCodePoint();
        }

        // c == '"', read next char
        c = readCodePoint();

        if (c == '@') {
            // Read language
            c = readCodePoint();

            if (!NTriplesUtil.isLetter(c)) {
                reportError("Expected a letter, found: " + new String(Character.toChars(c)),
                        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
            }

            while (c != -1 && c != '.' && c != '^' && c != ' ' && c != '\t') {
                this.languageTag.append(Character.toChars(c));
                c = readCodePoint();
            }
        } else if (c == '^') {
            // Read datatype
            c = readCodePoint();

            // c should be another '^'
            if (c == -1) {
                throwEOFException();
            } else if (c != '^') {
                reportError("Expected '^', found: " + new String(Character.toChars(c)),
                        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
            }

            c = readCodePoint();

            // c should be a '<'
            if (c == -1) {
                throwEOFException();
            } else if (c != '<') {
                reportError("Expected '<', found: " + new String(Character.toChars(c)),
                        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
            }

            c = parseUriRef(c, datatype);
        }

        return c;
    }

    @Override
    protected IRI createURI(String uri) throws RDFParseException {
        try {
            uri = NTriplesUtil.unescapeString(uri);
        } catch (IllegalArgumentException e) {
            reportError(e.getMessage(), NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }

        return super.createURI(uri);
    }

    /**
     * Reads the next Unicode code point.
     * 
     * @return the next Unicode code point, or -1 if the end of the stream has been reached.
     * @throws IOException
     */
    protected int readCodePoint() throws IOException {
        int next = reader.read();
        if (Character.isHighSurrogate((char) next)) {
            next = Character.toCodePoint((char) next, (char) reader.read());
        }
        return next;
    }

    protected Literal createLiteral(String label, String lang, String datatype) throws RDFParseException {
        try {
            label = NTriplesUtil.unescapeString(label);
        } catch (IllegalArgumentException e) {
            reportFatalError(e);
        }

        if (lang.length() == 0) {
            lang = null;
        }

        if (datatype.length() == 0) {
            datatype = null;
        }

        IRI dtURI = null;
        if (datatype != null) {
            dtURI = createURI(datatype);
        }

        return super.createLiteral(label, lang, dtURI, lineNo, -1);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportWarning(String)}, adding line number information to the error.
     */
    @Override
    protected void reportWarning(String msg) {
        reportWarning(msg, lineNo, -1);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportError(String, RioSetting)}, adding line number information to
     * the error.
     */
    @Override
    protected void reportError(String msg, RioSetting<Boolean> setting) throws RDFParseException {
        reportError(msg, lineNo, -1, setting);
    }

    protected void reportError(Exception e, RioSetting<Boolean> setting) throws RDFParseException {
        reportError(e, lineNo, -1, setting);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportFatalError(String)}, adding line number information to the
     * error.
     */
    @Override
    protected void reportFatalError(String msg) throws RDFParseException {
        reportFatalError(msg, lineNo, -1);
    }

    /**
     * Overrides {@link AbstractRDFParser#reportFatalError(Exception)}, adding line number information to the
     * error.
     */
    @Override
    protected void reportFatalError(Exception e) throws RDFParseException {
        reportFatalError(e, lineNo, -1);
    }

    protected void throwEOFException() throws RDFParseException {
        throw new RDFParseException("Unexpected end of file");
    }

    /**
     * Return a buffer of zero length and non-zero capacity. The same buffer is reused for each thing which is
     * parsed. This reduces the heap churn substantially. However, you have to watch out for side-effects and
     * convert the buffer to a {@link String} before the buffer is reused.
     * 
     * @return a buffer of zero length and non-zero capacity.
     */
    /*private StringBuilder getBuffer() {
       buffer.setLength(0);
       return buffer;
    }*/

    //private final StringBuilder buffer = new StringBuilder(100);

    /**
     * Return a buffer for the use of parsing literal language tags. The buffer is of zero length and non-zero
     * capacity. The same buffer is reused for each tag which is parsed. This reduces the heap churn
     * substantially. However, you have to watch out for side-effects and convert the buffer to a
     * {@link String} before the buffer is reused.
     * 
     * @return a buffer of zero length and non-zero capacity, for the use of parsing literal language tags.
     */
    /*private String getLanguageTagBuffer() {
       languageTag = "";
       //languageTagBuffer.setLength(0);
       return languageTag;
    }*/

    /**
     * Return a buffer for the use of parsing literal datatype URIs. The buffer is of zero length and non-zero
     * capacity. The same buffer is reused for each datatype which is parsed. This reduces the heap churn
     * substantially. However, you have to watch out for side-effects and convert the buffer to a
     * {@link String} before the buffer is reused.
     * 
     * @return a buffer of zero length and non-zero capacity, for the user of parsing literal datatype URIs.
     */
    /*private String getDatatypeUriBuffer() {
       datatypeUri = "";
       //datatypeUriBuffer.setLength(0);
       return datatypeUri;
    }*/

    @Override
    protected void clear() {
        super.clear();
        // get rid of anything large left in the buffers.
        //buffer.setLength(0);
        //buffer.trimToSize();
        //languageTagBuffer.setLength(0);
        //languageTagBuffer.trimToSize();
        //datatypeUriBuffer.setLength(0);
        //datatypeUriBuffer.trimToSize();
        //datatypeUri = "";
        //languageTag = "";
    }

    /*
     * N-Triples parser supports these settings.
     */
    @Override
    public Collection<RioSetting<?>> getSupportedSettings() {
        Collection<RioSetting<?>> result = new HashSet<RioSetting<?>>(super.getSupportedSettings());

        result.add(NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);

        return result;
    }

    @Override
    public RDFFormat getRDFFormat() {
        // TODO Auto-generated method stub
        return null;
    }

}