org.openrdf.rio.turtle.TurtleParser.java Source code

Introduction

Here is the source code for org.openrdf.rio.turtle.TurtleParser.java
Source

/* 
 * Licensed to Aduna under one or more contributor license agreements.  
 * See the NOTICE.txt file distributed with this work for additional 
 * information regarding copyright ownership. 
 *
 * Aduna licenses this file to you under the terms of the Aduna BSD 
 * License (the "License"); you may not use this file except in compliance 
 * with the License. See the LICENSE.txt file distributed with this work 
 * for the full License.
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 * implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */
package org.openrdf.rio.turtle;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.input.BOMInputStream;

import info.aduna.text.ASCIIUtil;

import org.openrdf.model.BNode;
import org.openrdf.model.IRI;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.SimpleValueFactory;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.XMLSchema;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RioSetting;
import org.openrdf.rio.helpers.BasicParserSettings;
import org.openrdf.rio.helpers.RDFParserBase;
import org.openrdf.rio.helpers.TurtleParserSettings;

/**
 * RDF parser for <a href="http://www.dajobe.org/2004/01/turtle/">Turtle</a>
 * files. This parser is not thread-safe, therefore its public methods are
 * synchronized.
 * <p>
 * This implementation is based on the 2006/01/02 version of the Turtle
 * specification, with slight deviations:
 * <ul>
 * <li>Normalization of integer, floating point and boolean values is dependent
 * on the specified datatype handling. According to the specification, integers
 * and booleans should be normalized, but floats should not.</li>
 * <li>Comments can be used anywhere in the document, and extend to the end of
 * the line. The Turtle grammar doesn't allow comments to be used inside triple
 * constructs that extend over multiple lines, but the author's own parser
 * deviates from this too.</li>
 * <li>The local name part of a prefixed name is allowed to start with a number
 * (cf. <a href="http://www.w3.org/TR/turtle/">the W3C Turtle Working
 * Draft</a>).</li>
 * </ul>
 * 
 * @author Arjohn Kampman
 */
public class TurtleParser extends RDFParserBase {

    /*-----------*
     * Variables *
     *-----------*/

    /**
     * Pushback-capable reader over the current input. A fresh instance is
     * created on every call to <tt>parse(Reader, String)</tt>.
     */
    private PushbackReader reader;

    /**
     * Subject of the statement currently being parsed. Reset to <tt>null</tt>
     * at the end of <tt>parseTriples()</tt>.
     */
    protected Resource subject;

    /**
     * Predicate of the statement currently being parsed. Reset to
     * <tt>null</tt> at the end of <tt>parseTriples()</tt>.
     */
    protected IRI predicate;

    /**
     * Most recently parsed object value. Reset to <tt>null</tt> at the end of
     * <tt>parseTriples()</tt>.
     */
    protected Value object;

    /**
     * Line counter, set back to 1 at the start of every
     * <tt>parse(Reader, String)</tt> call. Presumably advanced by the reading
     * helpers, which are not visible in this part of the file.
     */
    private int lineNumber = 1;

    /*--------------*
     * Constructors *
     *--------------*/

    /**
     * Creates a new TurtleParser that will use a {@link SimpleValueFactory} to
     * create RDF model objects.
     */
    public TurtleParser() {
        super();
    }

    /**
     * Constructs a TurtleParser whose RDF model objects are produced by the
     * given ValueFactory.
     * 
     * @param valueFactory
     *        The ValueFactory handed on to the superclass constructor.
     */
    public TurtleParser(ValueFactory valueFactory) {
        super(valueFactory);
    }

    /*---------*
     * Methods *
     *---------*/

    /**
     * Identifies the RDF format handled by this parser.
     * 
     * @return Always {@link RDFFormat#TURTLE}.
     */
    public RDFFormat getRDFFormat() {
        return RDFFormat.TURTLE;
    }

    /**
     * Returns the settings supported by the superclass, extended with
     * {@link TurtleParserSettings#CASE_INSENSITIVE_DIRECTIVES}.
     * 
     * @return A newly created set; modifying it does not affect the collection
     *         returned by the superclass.
     */
    @Override
    public Collection<RioSetting<?>> getSupportedSettings() {
        // Copy first so that the superclass collection is never modified
        Set<RioSetting<?>> supported = new HashSet<RioSetting<?>>(super.getSupportedSettings());
        supported.add(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES);
        return supported;
    }

    /**
     * Implementation of the <tt>parse(InputStream, String)</tt> method defined
     * in the RDFParser interface. The stream is decoded as UTF-8 and handed to
     * <tt>parse(Reader, String)</tt>.
     * 
     * @param in
     *        The InputStream from which to read the data, must not be
     *        <tt>null</tt>. The InputStream is supposed to contain UTF-8 encoded
     *        Unicode characters, as per the Turtle specification.
     * @param baseURI
     *        The URI associated with the data in the InputStream, must not be
     *        <tt>null</tt>.
     * @throws IOException
     *         If an I/O error occurred while data was read from the InputStream.
     * @throws RDFParseException
     *         If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *         If the configured statement handler encountered an unrecoverable
     *         error.
     * @throws IllegalArgumentException
     *         If the supplied input stream or base URI is <tt>null</tt>.
     */
    public synchronized void parse(InputStream in, String baseURI)
            throws IOException, RDFParseException, RDFHandlerException {
        if (in == null) {
            throw new IllegalArgumentException("Input stream must not be 'null'");
        }
        // Note: baseURI will be checked in parse(Reader, String)

        // Using the Charset constant instead of the charset name means no
        // UnsupportedEncodingException has to be handled here, so an exception
        // of that type raised further down is no longer masked as a
        // RuntimeException. The stream is wrapped in a BOMInputStream,
        // presumably so that a leading byte order mark is not passed on to the
        // parser.
        parse(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8), baseURI);
    }

    /**
     * Implementation of the <tt>parse(Reader, String)</tt> method defined in the
     * RDFParser interface. Signals the start of the document to the configured
     * handler (if any), parses statements until the end of the input is
     * reached, and then signals the end of the document. The parser state is
     * cleared whether or not parsing succeeds; the end of the document is only
     * signalled after a successful parse.
     * 
     * @param reader
     *        The Reader from which to read the data, must not be <tt>null</tt>.
     * @param baseURI
     *        The URI associated with the data in the Reader, must not be
     *        <tt>null</tt>.
     * @throws IOException
     *         If an I/O error occurred while data was read from the Reader.
     * @throws RDFParseException
     *         If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *         If the configured statement handler encountered an unrecoverable
     *         error.
     * @throws IllegalArgumentException
     *         If the supplied reader or base URI is <tt>null</tt>.
     */
    public synchronized void parse(Reader reader, String baseURI)
            throws IOException, RDFParseException, RDFHandlerException {
        if (reader == null) {
            throw new IllegalArgumentException("Reader must not be 'null'");
        }
        if (baseURI == null) {
            throw new IllegalArgumentException("base URI must not be 'null'");
        }

        if (rdfHandler != null) {
            rdfHandler.startRDF();
        }

        // Every document starts at line 1
        lineNumber = 1;

        // The pushback buffer holds at most 8 characters
        this.reader = new PushbackReader(reader, 8);

        // Store normalized base URI
        setBaseURI(baseURI);

        reportLocation();

        try {
            // skipWSC() yields -1 once the end of the input has been reached
            for (int next = skipWSC(); next != -1; next = skipWSC()) {
                parseStatement();
            }
        } finally {
            clear();
        }

        if (rdfHandler != null) {
            rdfHandler.endRDF();
        }
    }

    /**
     * Parses one top-level statement: either a directive (a token starting
     * with '@', or the bare keywords <tt>prefix</tt> / <tt>base</tt> in any
     * case) or a set of triples. Triples and '@' directives must be terminated
     * by a '.'; the bare keyword directives are not.
     */
    protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException {

        // Look ahead until whitespace, end of input, or 8 buffered characters;
        // the longest valid directive is "@prefix".
        // NOTE(review): a supplementary code point appended at length 7 grows
        // the buffer to 9 chars, one more than the pushback capacity set up in
        // parse(Reader, String) -- confirm unread(String) copes with that.
        StringBuilder lookahead = new StringBuilder(8);
        while (true) {
            int codePoint = readCodePoint();
            if (codePoint == -1 || TurtleUtil.isWhitespace(codePoint)) {
                unread(codePoint);
                break;
            }
            lookahead.append(Character.toChars(codePoint));
            if (lookahead.length() >= 8) {
                break;
            }
        }

        String token = lookahead.toString();
        boolean atDirective = token.startsWith("@");
        boolean isDirective = atDirective || token.equalsIgnoreCase("prefix")
                || token.equalsIgnoreCase("base");

        if (!isDirective) {
            // Not a directive: hand the lookahead back and parse triples
            unread(token);
            parseTriples();
            skipWSC();
            verifyCharacterOrFail(readCodePoint(), ".");
            return;
        }

        parseDirective(token);
        skipWSC();
        // SPARQL BASE and PREFIX lines do not end in .
        if (atDirective) {
            verifyCharacterOrFail(readCodePoint(), ".");
        }
    }

    /**
     * Dispatches a directive token to <tt>parsePrefixID()</tt> or
     * <tt>parseBase()</tt>. Recognised keywords are <tt>@prefix</tt> and
     * <tt>@base</tt> (exact case), <tt>prefix</tt> and <tt>base</tt> (any
     * case), and mixed-case variants of the '@' forms, which are only accepted
     * when {@link TurtleParserSettings#CASE_INSENSITIVE_DIRECTIVES} is enabled.
     * Characters of the token that follow the keyword are pushed back onto the
     * input before the directive body is parsed.
     * 
     * @param directive
     *        The token read at the start of the statement.
     */
    protected void parseDirective(String directive) throws IOException, RDFParseException, RDFHandlerException {
        if (directive.startsWith("@prefix")) {
            unreadAfterKeyword(directive, 7);
            parsePrefixID();
        } else if (directive.startsWith("@base")) {
            unreadAfterKeyword(directive, 5);
            parseBase();
        } else if (directive.regionMatches(true, 0, "prefix", 0, 6)) {
            // SPARQL doesn't require whitespace after directive, so part of the
            // prefixID may already have been read
            unreadAfterKeyword(directive, 6);
            parsePrefixID();
        } else if (directive.regionMatches(true, 0, "base", 0, 4)) {
            unreadAfterKeyword(directive, 4);
            parseBase();
        } else if (directive.regionMatches(true, 0, "@prefix", 0, 7)) {
            // Exact-case "@prefix" was handled above, so the case differs here
            if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
                reportFatalError("Cannot strictly support case-insensitive @prefix directive in compliance mode.");
            }
            unreadAfterKeyword(directive, 7);
            parsePrefixID();
        } else if (directive.regionMatches(true, 0, "@base", 0, 5)) {
            // Exact-case "@base" was handled above, so the case differs here
            if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) {
                reportFatalError("Cannot strictly support case-insensitive @base directive in compliance mode.");
            }
            unreadAfterKeyword(directive, 5);
            parseBase();
        } else if (directive.isEmpty()) {
            reportFatalError("Directive name is missing, expected @prefix or @base");
        } else {
            reportFatalError("Unknown directive \"" + directive + "\"");
        }
    }

    /**
     * Pushes back whatever part of the token follows its first
     * <tt>keywordLength</tt> characters; does nothing if there is no such part.
     */
    private void unreadAfterKeyword(String directive, int keywordLength) throws IOException {
        if (directive.length() > keywordLength) {
            unread(directive.substring(keywordLength));
        }
    }

    /**
     * Parses the body of a prefix directive: a prefix label, a ':' and a
     * namespace IRI. The resulting mapping is stored through
     * <tt>setNamespace</tt> and reported to the configured handler, if any.
     */
    protected void parsePrefixID() throws IOException, RDFParseException, RDFHandlerException {
        skipWSC();

        // Collect the prefix label (e.g. "rdf" or the empty string) up to the
        // ':' or the first whitespace character
        StringBuilder label = new StringBuilder(8);
        for (int cp = readCodePoint();; cp = readCodePoint()) {
            if (cp == ':') {
                // Leave the ':' in place; it is verified below
                unread(cp);
                break;
            }
            if (TurtleUtil.isWhitespace(cp)) {
                break;
            }
            if (cp == -1) {
                throwEOFException();
            }
            label.append(Character.toChars(cp));
        }

        skipWSC();
        verifyCharacterOrFail(readCodePoint(), ":");
        skipWSC();

        // Read the namespace URI
        String namespaceStr = parseURI().toString();
        String prefixStr = label.toString();

        // Store and report this namespace mapping
        setNamespace(prefixStr, namespaceStr);
        if (rdfHandler != null) {
            rdfHandler.handleNamespace(prefixStr, namespaceStr);
        }
    }

    /**
     * Parses the body of a base directive: reads an IRI and installs its
     * string form as the new base URI.
     */
    protected void parseBase() throws IOException, RDFParseException, RDFHandlerException {
        skipWSC();
        setBaseURI(parseURI().toString());
    }

    /**
     * Parses a subject followed by its predicate-object list. A subject that
     * starts with '[' is treated specially: <tt>[]</tt> becomes a fresh blank
     * node that must be followed by a predicate-object list, whereas
     * <tt>[ ... ]</tt> is parsed as an implicit blank node after which a
     * predicate-object list is optional. The subject, predicate and object
     * fields are reset to <tt>null</tt> afterwards.
     */
    protected void parseTriples() throws IOException, RDFParseException, RDFHandlerException {
        if (peekCodePoint() != '[') {
            parseSubject();
            skipWSC();
            parsePredicateObjectList();
        } else {
            // Consume the '[' to find out which blank node form follows
            readCodePoint();
            skipWSC();

            if (peekCodePoint() == ']') {
                // "[]": anonymous blank node as subject
                readCodePoint();
                subject = createBNode();
                skipWSC();
                parsePredicateObjectList();
            } else {
                // "[ ... ]": restore the bracket for parseImplicitBlank()
                unread('[');
                subject = parseImplicitBlank();
            }

            skipWSC();

            // Unless the statement ends here, the blank node parsed above is
            // the subject of a further predicate-object list
            if (peekCodePoint() != '.') {
                parsePredicateObjectList();
            }
        }

        subject = null;
        predicate = null;
        object = null;
    }

    /**
     * Parses one or more ';'-separated pairs of a predicate and an object
     * list. Repeated ';' characters are skipped, and a ';' directly followed
     * by '.', ']' or '}' ends the list.
     */
    protected void parsePredicateObjectList() throws IOException, RDFParseException, RDFHandlerException {
        predicate = parsePredicate();
        skipWSC();
        parseObjectList();

        while (skipWSC() == ';') {
            // Consume the ';'
            readCodePoint();

            switch (skipWSC()) {
            case '.': // end of triple
            case ']': // end of predicateObjectList inside blank node
            case '}':
                return;
            case ';':
                // empty predicateObjectList, skip to next
                continue;
            default:
                predicate = parsePredicate();
                skipWSC();
                parseObjectList();
            }
        }
    }

    /**
     * Parses one or more ','-separated objects for the current subject and
     * predicate.
     */
    protected void parseObjectList() throws IOException, RDFParseException, RDFHandlerException {
        while (true) {
            parseObject();

            if (skipWSC() != ',') {
                return;
            }

            // Consume the ',' and move on to the next object
            readCodePoint();
            skipWSC();
        }
    }

    /**
     * Parses the subject of a statement and stores it in the <tt>subject</tt>
     * field. The subject is a collection, an implicit blank node, or any other
     * value that is a {@link Resource}; a non-resource value is reported as a
     * fatal error.
     */
    protected void parseSubject() throws IOException, RDFParseException, RDFHandlerException {
        int first = peekCodePoint();

        if (first == '(') {
            subject = parseCollection();
            return;
        }
        if (first == '[') {
            subject = parseImplicitBlank();
            return;
        }

        Value candidate = parseValue();
        if (!(candidate instanceof Resource)) {
            reportFatalError("Illegal subject value: " + candidate);
            return;
        }
        subject = (Resource) candidate;
    }

    /**
     * Parses a predicate. The keyword <tt>a</tt> followed by whitespace is
     * shorthand for <tt>rdf:type</tt>; anything else must parse to an IRI.
     * 
     * @return The predicate, or <tt>null</tt> if a non-IRI value was reported
     *         as a fatal error and that report did not abort parsing.
     */
    protected IRI parsePredicate() throws IOException, RDFParseException, RDFHandlerException {
        // Check if the short-cut 'a' is used
        int first = readCodePoint();

        if (first == 'a') {
            int second = readCodePoint();

            if (TurtleUtil.isWhitespace(second)) {
                // Short-cut is used, return the rdf:type URI
                return RDF.TYPE;
            }

            // Short-cut is not used, push back in reverse reading order
            unread(second);
        }
        unread(first);

        // Predicate is a normal resource
        Value candidate = parseValue();
        if (!(candidate instanceof IRI)) {
            reportFatalError("Illegal predicate value: " + candidate);
            return null;
        }
        return (IRI) candidate;
    }

    /**
     * Parses a single object (a collection, an implicit blank node or a plain
     * value), stores it in the <tt>object</tt> field and reports the statement
     * formed by the current subject, predicate and object.
     */
    protected void parseObject() throws IOException, RDFParseException, RDFHandlerException {
        switch (peekCodePoint()) {
        case '(':
            object = parseCollection();
            break;
        case '[':
            object = parseImplicitBlank();
            break;
        default:
            object = parseValue();
            break;
        }

        reportStatement(subject, predicate, object);
    }

    /**
     * Parses a collection, e.g. <tt>( item1 item2 item3 )</tt>, and reports the
     * <tt>rdf:first</tt> / <tt>rdf:rest</tt> statements that represent it.
     * 
     * @return <tt>rdf:nil</tt> for an empty collection, otherwise the blank
     *         node at the head of the list.
     */
    protected Resource parseCollection() throws IOException, RDFParseException, RDFHandlerException {
        verifyCharacterOrFail(readCodePoint(), "(");

        if (skipWSC() == ')') {
            // Empty list: consume the ')' and return rdf:nil
            readCodePoint();
            return RDF.NIL;
        }

        BNode head = createBNode();

        // Remember the enclosing statement context
        Resource savedSubject = subject;
        IRI savedPredicate = predicate;

        // Each item is reported as <list node> rdf:first <item>
        subject = head;
        predicate = RDF.FIRST;

        parseObject();

        BNode tail = head;
        while (skipWSC() != ')') {
            // Chain a new list node behind the current tail
            BNode next = createBNode();
            reportStatement(tail, RDF.REST, next);

            tail = next;
            subject = next;

            parseObject();
        }

        // Consume the ')'
        readCodePoint();

        // Terminate the list
        reportStatement(tail, RDF.REST, RDF.NIL);

        // Restore the enclosing statement context
        subject = savedSubject;
        predicate = savedPredicate;

        return head;
    }

    /**
     * Parses an implicit blank node: either the token <tt>[]</tt> or a
     * predicate-object list enclosed in square brackets. Statements inside the
     * brackets are reported with the new blank node as their subject.
     * 
     * @return The blank node created for the construct.
     */
    protected Resource parseImplicitBlank() throws IOException, RDFParseException, RDFHandlerException {
        verifyCharacterOrFail(readCodePoint(), "[");

        BNode node = createBNode();

        int next = readCodePoint();
        if (next == ']') {
            // "[]" directly closed: nothing further to parse
            return node;
        }
        unread(next);

        // Remember the enclosing statement context
        Resource savedSubject = subject;
        IRI savedPredicate = predicate;

        // The nested predicate-object list describes the new blank node
        subject = node;

        skipWSC();
        parsePredicateObjectList();
        skipWSC();

        // Read closing bracket
        verifyCharacterOrFail(readCodePoint(), "]");

        // Restore the enclosing statement context
        subject = savedSubject;
        predicate = savedPredicate;

        return node;
    }

    /**
     * Parses an RDF value. The next character of the input selects the kind of
     * value: uriref, qname or boolean, node ID, quoted literal, or number.
     * 
     * @return The parsed value, or <tt>null</tt> if an error was reported and
     *         that report did not abort parsing.
     */
    protected Value parseValue() throws IOException, RDFParseException, RDFHandlerException {
        int next = peekCodePoint();

        if (next == '<') {
            // uriref, e.g. <foo://bar>
            return parseURI();
        }
        if (next == ':' || TurtleUtil.isPrefixStartChar(next)) {
            // qname or boolean
            return parseQNameOrBoolean();
        }
        if (next == '_') {
            // node ID, e.g. _:n1
            return parseNodeID();
        }
        if (next == '"' || next == '\'') {
            // quoted literal, e.g. "foo" or """foo""" or 'foo' or '''foo'''
            return parseQuotedLiteral();
        }
        if (ASCIIUtil.isNumber(next) || next == '.' || next == '+' || next == '-') {
            // integer or double, e.g. 123 or 1.2e3
            return parseNumber();
        }
        if (next == -1) {
            throwEOFException();
            return null;
        }

        reportFatalError("Expected an RDF value here, found '" + new String(Character.toChars(next)) + "'");
        return null;
    }

    /**
     * Parses a quoted string together with an optional language tag
     * (<tt>@lang</tt>) or datatype (<tt>^^datatype</tt>) suffix.
     * 
     * @return The literal, or <tt>null</tt> if the datatype was not an IRI and
     *         the fatal error report did not abort parsing.
     */
    protected Literal parseQuotedLiteral() throws IOException, RDFParseException, RDFHandlerException {
        String label = parseQuotedString();

        // Check for presence of a language tag or datatype
        int c = peekCodePoint();

        if (c != '@' && c != '^') {
            // Plain literal
            return createLiteral(label, null, null, getLineNumber(), -1);
        }

        // Consume the '@' or the first '^'
        readCodePoint();

        if (c == '^') {
            // next character should be another '^'
            verifyCharacterOrFail(readCodePoint(), "^");

            skipWSC();

            // Read datatype
            Value datatype = parseValue();
            if (!(datatype instanceof IRI)) {
                reportFatalError("Illegal datatype value: " + datatype);
                return null;
            }
            return createLiteral(label, null, (IRI) datatype, getLineNumber(), -1);
        }

        // Read language
        StringBuilder lang = new StringBuilder(8);

        c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        }

        boolean verifyLanguageTag = getParserConfig().get(BasicParserSettings.VERIFY_LANGUAGE_TAGS);
        if (verifyLanguageTag && !TurtleUtil.isLanguageStartChar(c)) {
            reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.VERIFY_LANGUAGE_TAGS);
        }

        lang.append(Character.toChars(c));

        // SES-1887 : Flexibility introduced for SES-1985 and SES-1821 needs
        // to be counterbalanced against legitimate situations where Turtle
        // language tags do not need whitespace following the language tag,
        // so punctuation and end of input terminate the tag as well.
        c = readCodePoint();
        while (!TurtleUtil.isWhitespace(c) && c != '.' && c != ';' && c != ',' && c != ')' && c != ']'
                && c != -1) {
            if (verifyLanguageTag && !TurtleUtil.isLanguageChar(c)) {
                reportError("Illegal language tag char: '" + new String(Character.toChars(c)) + "'",
                        BasicParserSettings.VERIFY_LANGUAGE_TAGS);
            }
            lang.append(Character.toChars(c));
            c = readCodePoint();
        }

        // The terminating character is not part of the tag
        unread(c);

        return createLiteral(label, lang.toString(), null, getLineNumber(), -1);
    }

    /**
     * Parses a quoted string, which is either a "normal string" or a """long
     * string""", delimited by double or single quotes, and decodes its escape
     * sequences.
     * 
     * @return The decoded string, or the undecoded text if decoding failed and
     *         the error report did not abort parsing.
     */
    protected String parseQuotedString() throws IOException, RDFParseException {
        int quote = readCodePoint();

        // First character should be '"' or "'"
        verifyCharacterOrFail(quote, "\"\'");

        // A long string starts (and ends) with three identical quote characters
        int second = readCodePoint();
        int third = readCodePoint();
        boolean longString = (quote == '"' || quote == '\'') && second == quote && third == quote;

        String raw;
        if (longString) {
            raw = parseLongString(second);
        } else {
            // Normal string: only the opening quote has been consumed
            unread(third);
            unread(second);

            raw = parseString(quote);
        }

        // Unescape any escape sequences
        try {
            return TurtleUtil.decodeString(raw);
        } catch (IllegalArgumentException e) {
            reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
            return raw;
        }
    }

    /**
     * Parses a "normal string". This method requires that the opening character
     * has already been parsed. Escape sequences are copied verbatim, backslash
     * included; they are not decoded here.
     * 
     * @param closingCharacter
     *        The quote character that terminates the string.
     * @return The text between the quotes.
     */
    protected String parseString(int closingCharacter) throws IOException, RDFParseException {
        StringBuilder text = new StringBuilder(32);

        for (int cp = readCodePoint(); cp != closingCharacter; cp = readCodePoint()) {
            if (cp == -1) {
                throwEOFException();
            }

            text.append(Character.toChars(cp));

            if (cp == '\\') {
                // The escaped character might be the closing quote, so copy
                // it without testing it against the terminator
                int escaped = readCodePoint();
                if (escaped == -1) {
                    throwEOFException();
                }
                text.append(Character.toChars(escaped));
            }
        }

        return text.toString();
    }

    /**
     * Parses a """long string""". This method requires that the first three
     * characters have already been parsed. Escape sequences are copied
     * verbatim, backslash included; they are not decoded here.
     * 
     * @param closingCharacter
     *        The quote character; three of them in a row end the string.
     * @return The text between the opening and closing quote triples.
     */
    protected String parseLongString(int closingCharacter) throws IOException, RDFParseException {
        StringBuilder text = new StringBuilder(1024);

        // Number of consecutive closing characters seen most recently
        int quoteRun = 0;

        while (quoteRun < 3) {
            int cp = readCodePoint();

            if (cp == -1) {
                throwEOFException();
            } else if (cp == closingCharacter) {
                quoteRun++;
            } else {
                quoteRun = 0;
            }

            text.append(Character.toChars(cp));

            if (cp == '\\') {
                // The escaped character might be a quote; it must not count
                // towards the closing run
                int escaped = readCodePoint();
                if (escaped == -1) {
                    throwEOFException();
                }
                text.append(Character.toChars(escaped));
            }
        }

        // Drop the three closing quotes that were appended above
        text.setLength(text.length() - 3);
        return text.toString();
    }

    /**
     * Parses a numeric literal. The datatype of the result is
     * <tt>xsd:integer</tt> by default, <tt>xsd:decimal</tt> when a fractional
     * part is present, and <tt>xsd:double</tt> when an exponent is present.
     * The character that follows the number is pushed back onto the input.
     * 
     * @return The typed literal; its label is the text exactly as read.
     */
    protected Literal parseNumber() throws IOException, RDFParseException {
        StringBuilder value = new StringBuilder(8);
        IRI datatype = XMLSchema.INTEGER;

        int c = readCodePoint();

        // read optional sign character
        if (c == '+' || c == '-') {
            value.append(Character.toChars(c));
            c = readCodePoint();
        }

        // read integer digits
        while (ASCIIUtil.isNumber(c)) {
            value.append(Character.toChars(c));
            c = readCodePoint();
        }

        if (c == '.' || c == 'e' || c == 'E') {

            // read optional fractional digits
            if (c == '.') {

                if (TurtleUtil.isWhitespace(peekCodePoint())) {
                    // We're parsing an integer that did not have a space before the
                    // period to end the statement
                } else {
                    value.append(Character.toChars(c));

                    c = readCodePoint();

                    while (ASCIIUtil.isNumber(c)) {
                        value.append(Character.toChars(c));
                        c = readCodePoint();
                    }

                    if (value.length() == 1) {
                        // We've only parsed a '.'
                        reportFatalError("Object for statement missing");
                    }

                    // We're parsing a decimal or a double
                    datatype = XMLSchema.DECIMAL;
                }
            } else if (value.length() == 0) {
                // We've only parsed an 'e' or 'E'
                reportFatalError("Object for statement missing");
            }

            // read optional exponent
            if (c == 'e' || c == 'E') {
                datatype = XMLSchema.DOUBLE;
                value.append(Character.toChars(c));

                c = readCodePoint();
                if (c == '+' || c == '-') {
                    value.append(Character.toChars(c));
                    c = readCodePoint();
                }

                if (!ASCIIUtil.isNumber(c)) {
                    reportError("Exponent value missing", BasicParserSettings.VERIFY_DATATYPE_VALUES);

                    if (c == -1) {
                        // reportError() returned (lenient mode) and the input
                        // ended. Character.toChars(-1) below would throw an
                        // IllegalArgumentException, so report EOF instead.
                        throwEOFException();
                    }
                }

                value.append(Character.toChars(c));

                c = readCodePoint();
                while (ASCIIUtil.isNumber(c)) {
                    value.append(Character.toChars(c));
                    c = readCodePoint();
                }
            }
        }

        // Unread last character, it isn't part of the number
        unread(c);

        // Return result as a typed literal
        return createLiteral(value.toString(), null, datatype, getLineNumber(), -1);
    }

    protected IRI parseURI() throws IOException, RDFParseException {
        StringBuilder uriBuf = new StringBuilder(100);

        // First character should be '<'
        int c = readCodePoint();
        verifyCharacterOrFail(c, "<");

        // Read up to the next '>' character
        while (true) {
            c = readCodePoint();

            if (c == '>') {
                break;
            } else if (c == -1) {
                throwEOFException();
            }

            if (c == ' ') {
                reportFatalError("IRI included an unencoded space: '" + c + "'");
            }

            uriBuf.append(Character.toChars(c));

            if (c == '\\') {
                // This escapes the next character, which might be a '>'
                c = readCodePoint();
                if (c == -1) {
                    throwEOFException();
                }
                if (c != 'u' && c != 'U') {
                    reportFatalError("IRI includes string escapes: '\\" + c + "'");
                }
                uriBuf.append(Character.toChars(c));
            }
        }

        if (c == '.') {
            reportFatalError("IRI must not end in a '.'");
        }

        String uri = uriBuf.toString();

        // Unescape any escape sequences
        try {
            // FIXME: The following decodes \n and similar in URIs, which should be
            // invalid according to test <turtle-syntax-bad-uri-04.ttl>
            uri = TurtleUtil.decodeString(uri);
        } catch (IllegalArgumentException e) {
            reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES);
        }

        return super.resolveURI(uri);
    }

    /**
     * Parses a prefixed name (qname) or a boolean value. Both can start with
     * the same characters, so they are told apart here: a token that is
     * exactly <tt>true</tt> or <tt>false</tt> and is not followed by a ':' is
     * returned as a literal with datatype {@link XMLSchema#BOOLEAN}; anything
     * else has to be of the form <tt>prefix:localName</tt> (prefix and local
     * name may both be empty) and is returned as the value created from the
     * namespace name concatenated with the local name.
     * 
     * @return the boolean literal or the resource that was parsed.
     * @throws IOException
     *         if reading from, or pushing back to, the reader fails.
     * @throws RDFParseException
     *         on premature end of input, if the ':' separator is missing, if
     *         the prefix ends with a '.', if an invalid local escape is found,
     *         or if the local name contains an incomplete percent-encoded
     *         sequence.
     */
    protected Value parseQNameOrBoolean() throws IOException, RDFParseException {
        // First character should be a ':' or a letter
        int c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        }
        if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) {
            reportError("Expected a ':' or a letter, found '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.VERIFY_RELATIVE_URIS);
        }

        String namespace = null;

        if (c == ':') {
            // qname using default namespace
            namespace = getNamespace("");
        } else {
            // c is the first letter of the prefix
            StringBuilder prefix = new StringBuilder(8);
            prefix.append(Character.toChars(c));

            // Remember the last character that was accepted into the prefix so
            // that a trailing '.' can be detected once the ':' has been seen.
            int previousChar = c;
            c = readCodePoint();
            while (TurtleUtil.isPrefixChar(c)) {
                prefix.append(Character.toChars(c));
                previousChar = c;
                c = readCodePoint();
            }

            if (c != ':') {
                // prefix may actually be a boolean value
                String value = prefix.toString();

                if (value.equals("true") || value.equals("false")) {
                    // c belongs to whatever follows the boolean, hand it back
                    unread(c);
                    return createLiteral(value, null, XMLSchema.BOOLEAN, getLineNumber(), -1);
                }
            } else if (previousChar == '.') {
                // '.' is a legal prefix name char, but can not appear at the end
                reportFatalError("prefix can not end with '.'");
            }

            // Not a boolean, so the ':' separator is mandatory at this point
            verifyCharacterOrFail(c, ":");

            namespace = getNamespace(prefix.toString());
        }

        // c == ':', read optional local name
        StringBuilder localName = new StringBuilder(16);
        c = readCodePoint();
        if (TurtleUtil.isNameStartChar(c)) {
            if (c == '\\') {
                localName.append(readLocalEscapedChar());
            } else {
                localName.append(Character.toChars(c));
            }

            // For an escaped character previousChar holds the backslash, not
            // the escaped character, so an escaped '.' at the end of the name
            // is not mistaken for a trailing '.' below.
            int previousChar = c;
            c = readCodePoint();
            while (TurtleUtil.isNameChar(c)) {
                if (c == '\\') {
                    localName.append(readLocalEscapedChar());
                } else {
                    localName.append(Character.toChars(c));
                }
                previousChar = c;
                c = readCodePoint();
            }

            // Unread last character
            unread(c);

            if (previousChar == '.') {
                // '.' is a legal name char, but can not appear at the end, so is
                // not actually part of the name
                unread(previousChar);
                localName.deleteCharAt(localName.length() - 1);
            }
        } else {
            // Unread last character
            unread(c);
        }

        String localNameString = localName.toString();

        // Every '%' in the local name has to be followed by two hex digits
        for (int i = 0; i < localNameString.length(); i++) {
            if (localNameString.charAt(i) == '%') {
                if (i > localNameString.length() - 3 || !ASCIIUtil.isHex(localNameString.charAt(i + 1))
                        || !ASCIIUtil.isHex(localNameString.charAt(i + 2))) {
                    reportFatalError("Found incomplete percent-encoded sequence: " + localNameString);
                }
            }
        }

        // Note: namespace has already been resolved
        return createURI(namespace + localNameString);
    }

    /**
     * Reads the code point that follows a backslash inside a local name and
     * verifies that it is one of the characters that may be escaped there.
     * 
     * @return the escaped character, without the leading backslash.
     * @throws RDFParseException
     *         if the end of the input has been reached, or if the character is
     *         not accepted by {@link TurtleUtil#isLocalEscapedChar(int)}.
     * @throws IOException
     *         if reading from the reader fails.
     */
    private char readLocalEscapedChar() throws RDFParseException, IOException {
        int c = readCodePoint();

        if (c == -1) {
            // Without this check the error message below would call
            // Character.toChars(-1), which throws an IllegalArgumentException
            // instead of reporting a parse error.
            throwEOFException();
        }

        if (TurtleUtil.isLocalEscapedChar(c)) {
            return (char) c;
        }

        throw new RDFParseException("found '" + new String(Character.toChars(c)) + "', expected one of: "
                + Arrays.toString(TurtleUtil.LOCAL_ESCAPED_CHARS));
    }

    /**
     * Parses a blank node ID, e.g. <tt>_:node1</tt>.
     * <p>
     * A '.' is accepted inside the label, but a '.' that is directly followed
     * by the end of the input, white space, '&lt;' or '_' is not treated as
     * part of the label: it is pushed back, together with the character that
     * follows it, and the label ends before it.
     * 
     * @return the blank node created for the label that was read.
     * @throws IOException
     *         if reading from, or pushing back to, the reader fails.
     * @throws RDFParseException
     *         if the input does not start with <tt>_:</tt>, or if the end of
     *         the input is reached before the first character of the label.
     */
    protected BNode parseNodeID() throws IOException, RDFParseException {
        // Node ID should start with "_:"
        verifyCharacterOrFail(readCodePoint(), "_");
        verifyCharacterOrFail(readCodePoint(), ":");

        // Read the node ID
        int c = readCodePoint();
        if (c == -1) {
            throwEOFException();
        } else if (!TurtleUtil.isBLANK_NODE_LABEL_StartChar(c)) {
            // Render the full code point; a plain (char) cast would garble
            // characters outside the Basic Multilingual Plane.
            reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'",
                    BasicParserSettings.PRESERVE_BNODE_IDS);
        }

        StringBuilder name = new StringBuilder(32);
        name.appendCodePoint(c);

        // Read all following letter and numbers, they are part of the name
        c = readCodePoint();

        // If we would never go into the loop we must unread now
        if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
            unread(c);
        }

        while (TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
            // Look one code point ahead: whether 'previous' belongs to the
            // label depends on what follows it.
            int previous = c;
            c = readCodePoint();

            if (previous == '.' && (c == -1 || TurtleUtil.isWhitespace(c) || c == '<' || c == '_')) {
                // Push back in reverse order so that the '.' is read first
                unread(c);
                unread(previous);
                break;
            }
            // appendCodePoint keeps supplementary characters intact, where a
            // (char) cast would cut them down to a single UTF-16 unit.
            name.appendCodePoint(previous);
            if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) {
                unread(c);
            }
        }

        return createBNode(name.toString());
    }

    /**
     * Creates a statement from the supplied subject, predicate and object and
     * passes it on to the {@link #rdfHandler}, if one has been set. The
     * statement is created even when there is no handler.
     */
    protected void reportStatement(Resource subj, IRI pred, Value obj)
            throws RDFParseException, RDFHandlerException {
        final Statement statement = createStatement(subj, pred, obj);
        if (rdfHandler == null) {
            // Nobody is listening
            return;
        }
        rdfHandler.handleStatement(statement);
    }

    /**
     * Checks that the supplied code point is one of the characters listed in
     * <tt>expected</tt>. If it is not, a fatal error is reported whose message
     * lists every expected character as well as the character that was found.
     * 
     * @param codePoint
     *        the code point to check, or -1 for end of input.
     * @param expected
     *        the acceptable characters.
     * @throws RDFParseException
     *         if <tt>codePoint</tt> is -1 or is not contained in
     *         <tt>expected</tt>.
     */
    protected void verifyCharacterOrFail(int codePoint, String expected) throws RDFParseException {
        if (codePoint == -1) {
            throwEOFException();
        }

        final String actual = new String(Character.toChars(codePoint));
        if (expected.contains(actual)) {
            // Character is acceptable, nothing to report
            return;
        }

        final StringBuilder message = new StringBuilder(32).append("Expected ");
        String separator = "";
        for (char option : expected.toCharArray()) {
            message.append(separator).append('\'').append(option).append('\'');
            separator = " or ";
        }
        message.append(", found '").append(actual).append("'");

        reportFatalError(message.toString());
    }

    /**
     * Skips over white space and #-style comments in <tt>reader</tt>. Comments
     * are handed to {@link #processComment()}; every line feed that is skipped
     * as white space increases the line number. When this method returns, the
     * next code point supplied by <tt>reader</tt> is either a non-ignorable
     * character or EOF; that code point is also returned for convenience.
     * 
     * @return The next character code point that will be returned by
     *         <tt>reader</tt>, or -1 if the end of the input was reached.
     */
    protected int skipWSC() throws IOException, RDFHandlerException {
        int next;
        for (next = readCodePoint(); TurtleUtil.isWhitespace(next) || next == '#'; next = readCodePoint()) {
            switch (next) {
                case '#':
                    processComment();
                    break;
                case '\n':
                    // Only line feeds (LF) are counted, not carriage returns
                    // (CR), as a CR is normally immediately followed by a LF.
                    lineNumber++;
                    break;
                default:
                    // Any other white space is simply dropped
                    break;
            }
        }

        // The first non-ignorable code point has to stay available to the caller
        unread(next);

        return next;
    }

    /**
     * Consumes characters from reader until the first EOL has been read. This
     * line of text, without the EOL, is then passed to the {@link #rdfHandler}
     * as a comment, and the current location is reported.
     * <p>
     * The EOL is consumed by this method, so the line feed that terminates the
     * comment is counted here; it is never seen by the caller. A carriage
     * return that is not followed by a line feed is not counted.
     * 
     * @throws IOException
     *         if reading from, or pushing back to, the reader fails.
     * @throws RDFHandlerException
     *         if the handler fails to process the comment.
     */
    protected void processComment() throws IOException, RDFHandlerException {
        StringBuilder comment = new StringBuilder(64);
        int c = readCodePoint();
        while (c != -1 && c != 0xD && c != 0xA) {
            comment.append(Character.toChars(c));
            c = readCodePoint();
        }

        // c is equal to -1, \r or \n.
        // In case c is equal to \r, we should also read a following \n.
        if (c == 0xD) {
            c = readCodePoint();

            if (c != 0xA) {
                unread(c);
            }
        }

        // c is only a line feed here if one was actually consumed above, either
        // on its own or as the second half of a CR LF pair. Without counting it
        // every comment would make the reported line numbers fall behind.
        if (c == 0xA) {
            lineNumber++;
        }

        if (rdfHandler != null) {
            rdfHandler.handleComment(comment.toString());
        }
        reportLocation();
    }

    /**
     * Reads the next Unicode code point. A high surrogate that is directly
     * followed by a low surrogate is combined with it into a single
     * supplementary code point. An unpaired high surrogate is returned as is,
     * and the character that follows it, if any, stays available for the next
     * read.
     * 
     * @return the next Unicode code point, or -1 if the end of the stream has
     *         been reached.
     * @throws IOException
     *         if reading from, or pushing back to, the reader fails.
     */
    protected int readCodePoint() throws IOException {
        int next = reader.read();
        if (next != -1 && Character.isHighSurrogate((char) next)) {
            final int low = reader.read();
            if (low != -1 && Character.isLowSurrogate((char) low)) {
                next = Character.toCodePoint((char) next, (char) low);
            } else if (low != -1) {
                // Not a valid pair: combining the two would produce a bogus
                // code point and swallow the following character.
                reader.unread(low);
            }
            // low == -1: end of stream right after the high surrogate, there
            // is nothing to combine with and nothing to push back.
        }
        return next;
    }

    /**
     * Returns a single code point to the front of the reader's buffer, so that
     * the next call to {@link #readCodePoint()} yields this code point again.
     * Supplementary code points are pushed back as a surrogate pair. The value
     * -1 (end of stream) is ignored.
     * 
     * @param codePoint
     *        a single Unicode code point, or -1.
     * @throws IOException
     *         if the reader fails to push back the character(s).
     */
    protected void unread(int codePoint) throws IOException {
        if (codePoint == -1) {
            // End of stream marker, nothing to push back
            return;
        }

        if (!Character.isSupplementaryCodePoint(codePoint)) {
            reader.unread(codePoint);
            return;
        }

        reader.unread(Character.toChars(codePoint));
    }

    /**
     * Pushes back the supplied string by copying it to the front of the buffer.
     * After this method returns, successive calls to {@link #readCodePoint()}
     * will return the code points in the supplied string again, starting at the
     * first in the String.
     * 
     * @param string
     *        the string to un-read.
     * @throws IOException
     *         if the reader fails to push back the characters.
     */
    protected void unread(String string) throws IOException {
        // Walk the string backwards by UTF-16 index, not by code point count:
        // the two differ as soon as the string contains a supplementary
        // character, and codePointBefore expects a char index.
        int index = string.length();
        while (index > 0) {
            final int codePoint = string.codePointBefore(index);
            unread(codePoint);
            index -= Character.charCount(codePoint);
        }
    }

    /**
     * Looks ahead at the next Unicode code point: the code point is read and
     * immediately pushed back, so that the reader's position is unchanged
     * afterwards.
     * 
     * @return the upcoming Unicode code point, or -1 if the end of the stream
     *         has been reached.
     * @throws IOException
     *         if reading from, or pushing back to, the reader fails.
     */
    protected int peekCodePoint() throws IOException {
        final int upcoming = readCodePoint();
        unread(upcoming);
        return upcoming;
    }

    /**
     * Reports the current line number as the parser's location. -1 is passed
     * in the second position (presumably meaning that no column information is
     * available).
     */
    protected void reportLocation() {
        final int currentLine = getLineNumber();
        reportLocation(currentLine, -1);
    }

    /**
     * Overrides {@link RDFParserBase#reportWarning(String)} so that the
     * warning is reported together with the current line number.
     */
    @Override
    protected void reportWarning(String msg) {
        final int currentLine = getLineNumber();
        reportWarning(msg, currentLine, -1);
    }

    /**
     * Overrides {@link RDFParserBase#reportError(String, RioSetting)} so that
     * the error is reported together with the current line number.
     */
    @Override
    protected void reportError(String msg, RioSetting<Boolean> setting) throws RDFParseException {
        final int currentLine = getLineNumber();
        reportError(msg, currentLine, -1, setting);
    }

    /**
     * Overrides {@link RDFParserBase#reportFatalError(String)} so that the
     * fatal error is reported together with the current line number.
     */
    @Override
    protected void reportFatalError(String msg) throws RDFParseException {
        final int currentLine = getLineNumber();
        reportFatalError(msg, currentLine, -1);
    }

    /**
     * Overrides {@link RDFParserBase#reportFatalError(Exception)} so that the
     * exception is reported together with the current line number.
     */
    @Override
    protected void reportFatalError(Exception e) throws RDFParseException {
        final int currentLine = getLineNumber();
        reportFatalError(e, currentLine, -1);
    }

    /**
     * Signals that the input ended in the middle of a construct.
     * 
     * @throws RDFParseException
     *         always.
     */
    protected void throwEOFException() throws RDFParseException {
        final String message = "Unexpected end of file";
        throw new RDFParseException(message);
    }

    /**
     * Returns the current value of the line counter.
     */
    private int getLineNumber() {
        return this.lineNumber;
    }
}