org.apache.any23.servlet.Servlet.java Source code

Introduction

Here is the source code for org.apache.any23.servlet.Servlet.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.servlet;

import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.servlet.conneg.Any23Negotiator;
import org.apache.any23.servlet.conneg.MediaRangeSpec;
import org.apache.any23.source.ByteArrayDocumentSource;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.commons.httpclient.URI;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.regex.Pattern;

import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;

/**
 * A <i>Servlet</i> that fetches a client-specified <i>IRI</i>,
 * RDFizes the content, and returns it in a format chosen by the client.
 *
 * <p>Supported request shapes, as implemented below:</p>
 * <ul>
 * <li><b>GET</b> <code>/format/iri</code>, or <code>/format?uri=iri</code>
 * (<code>url</code> is accepted as an alias of <code>uri</code>).</li>
 * <li><b>POST</b> with an <code>application/x-www-form-urlencoded</code> body carrying
 * either a <code>uri</code> parameter or a <code>body</code> parameter (plus an optional
 * <code>type</code> parameter giving the content type of <code>body</code>).</li>
 * <li><b>POST</b> with any other <code>Content-Type</code>: the raw request body is the
 * document to convert.</li>
 * </ul>
 *
 * <p>The output format is the first path segment, the <code>format</code> parameter, or,
 * when neither is given (or it is <code>best</code>), the result of content negotiation on
 * the <code>Accept</code> header. The optional parameters <code>report</code>,
 * <code>annotate</code> and <code>validation-mode</code> tune the response.</p>
 *
 * @author Gabriele Renzi
 * @author Richard Cyganiak (richard@cyganiak.de)
 */
public class Servlet extends HttpServlet {

    private static final Logger LOG = LoggerFactory.getLogger(Servlet.class);

    public static final String DEFAULT_BASE_IRI = "http://any23.org/tmp/";

    private static final long serialVersionUID = 8207685628715421336L;

    /** Media type of HTML form submissions; compared case-insensitively. */
    private static final String FORM_CONTENT_TYPE = "application/x-www-form-urlencoded";

    // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    private static final Pattern schemeRegex = Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");

    // A scheme followed by exactly one slash, e.g. "http:/example.com".
    private static final Pattern schemeAndSingleSlashRegex = Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");

    /**
     * Handles <code>GET</code> requests: resolves the output format and the input IRI,
     * then runs the extraction on the document fetched from that IRI.
     *
     * <p>Responds with 406 when no output format can be determined, 404 when no input IRI
     * is given, and 400 when the IRI or the <code>validation-mode</code> parameter is
     * invalid.</p>
     *
     * @param req the incoming request.
     * @param resp the response to write to.
     * @throws IOException if writing the response fails.
     * @throws ServletException declared by the servlet contract.
     */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
        final WebResponder responder = new WebResponder(this, resp);
        final String format = getFormatFromRequestOrNegotiation(req);
        final boolean report = isReport(req);
        final boolean annotate = isAnnotated(req);
        if (format == null) {
            responder.sendError(406, "Client accept header does not include a supported output format", report);
            return;
        }
        final String uri = getInputIRIFromRequest(req);
        if (uri == null) {
            responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report);
            return;
        }
        final ExtractionParameters eps = getExtractionParametersOrSendError(responder, req, report);
        if (eps == null) {
            return; // a 400 response has already been sent
        }
        final DocumentSource source = createHTTPDocumentSource(responder, uri, report);
        if (source == null) {
            return; // a 400 response has already been sent
        }
        responder.runExtraction(source, eps, format, report, annotate);
    }

    /**
     * Handles <code>POST</code> requests. Form-encoded requests are converted from the
     * <code>uri</code> parameter (or path IRI) if present, otherwise from the
     * <code>body</code> parameter; any other content type is converted from the raw
     * request body.
     *
     * <p>Responds with 400 when the <code>Content-Type</code> is missing, when a form
     * request carries neither an IRI nor a <code>body</code>, or when the IRI or the
     * <code>validation-mode</code> parameter is invalid; responds with 406 when no output
     * format can be determined.</p>
     *
     * @param req the incoming request.
     * @param resp the response to write to.
     * @throws IOException if reading the request or writing the response fails.
     */
    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
        final WebResponder responder = new WebResponder(this, resp);
        final boolean report = isReport(req);
        final boolean annotate = isAnnotated(req);
        if (req.getContentType() == null) {
            responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified",
                    report);
            return;
        }
        final String uri = getInputIRIFromRequest(req);
        final String format = getFormatFromRequestOrNegotiation(req);
        if (format == null) {
            responder.sendError(406, "Client accept header does not include a supported output format", report);
            return;
        }
        final ExtractionParameters eps = getExtractionParametersOrSendError(responder, req, report);
        if (eps == null) {
            return; // a 400 response has already been sent
        }
        final String contentType = getContentTypeHeader(req);
        // Media types are case-insensitive; equalsIgnoreCase(null) is simply false.
        if (FORM_CONTENT_TYPE.equalsIgnoreCase(contentType)) {
            if (uri != null) {
                log("Attempting conversion to '" + format + "' from IRI <" + uri + ">");
                final DocumentSource source = createHTTPDocumentSource(responder, uri, report);
                if (source == null) {
                    return; // a 400 response has already been sent
                }
                responder.runExtraction(source, eps, format, report, annotate);
                return;
            }
            final String body = req.getParameter("body");
            if (body == null) {
                responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
                return;
            }
            // An absent or empty 'type' parameter means "content type unknown".
            String type = req.getParameter("type");
            if ("".equals(type)) {
                type = null;
            }
            log("Attempting conversion to '" + format + "' from body parameter");
            responder.runExtraction(new StringDocumentSource(body, Servlet.DEFAULT_BASE_IRI, type), eps, format,
                    report, annotate);
            return;
        }
        log("Attempting conversion to '" + format + "' from POST body");
        responder.runExtraction(
                new ByteArrayDocumentSource(req.getInputStream(), Servlet.DEFAULT_BASE_IRI, contentType), eps,
                format, report, annotate);
    }

    /**
     * Determines the output format name, preferring an explicit request value and falling
     * back to negotiation on the <code>Accept</code> header when the request asks for
     * <code>best</code> (or nothing).
     *
     * @param request the incoming request.
     * @return the format name, or <code>null</code> if negotiation finds no match.
     */
    private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
        final String fromRequest = getFormatFromRequest(request);
        if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
            return fromRequest;
        }
        final MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
        if (result == null) {
            return null;
        }
        final String mediaType = result.getMediaType();
        if (RDFFormat.TURTLE.hasMIMEType(mediaType)) {
            return "turtle";
        } else if (RDFFormat.N3.hasMIMEType(mediaType)) {
            return "n3";
        } else if (RDFFormat.NQUADS.hasMIMEType(mediaType)) {
            return "nq";
        } else if (RDFFormat.RDFXML.hasMIMEType(mediaType)) {
            return "rdf";
        } else if (RDFFormat.NTRIPLES.hasMIMEType(mediaType)) {
            return "nt";
        } else if (RDFFormat.JSONLD.hasMIMEType(mediaType)) {
            return "ld+json";
        } else {
            return "turtle"; // shouldn't happen
        }
    }

    /**
     * Reads the explicitly requested format: the first path segment if non-empty,
     * otherwise the <code>format</code> parameter.
     *
     * @param request the incoming request.
     * @return the requested format, or <code>"best"</code> when none is specified.
     */
    private String getFormatFromRequest(HttpServletRequest request) {
        final String pathInfo = request.getPathInfo();
        if (pathInfo == null) {
            return "best";
        }
        // pathInfo starts with "/", so args[0] is empty and args[1] is the format segment.
        final String[] args = pathInfo.split("/", 3);
        if (args.length < 2 || "".equals(args[1])) {
            final String formatParameter = request.getParameter("format");
            return formatParameter == null ? "best" : formatParameter;
        }
        return args[1];
    }

    /**
     * Extracts the input IRI from the request. The IRI is whatever follows the format
     * segment in the path (with the query string re-attached); when the path carries no
     * IRI, the <code>uri</code> or <code>url</code> parameter is used instead.
     *
     * @param request the incoming request.
     * @return the trimmed IRI, or <code>null</code> if the request has no path info or
     *         specifies no IRI.
     */
    private String getInputIRIFromRequest(HttpServletRequest request) {
        final String pathInfo = request.getPathInfo();
        if (pathInfo == null) {
            return null;
        }
        final String[] args = pathInfo.split("/", 3);
        // An empty remainder (e.g. "/rdf/") carries no IRI; without this check the query
        // string would be glued onto a bare "http://" and the uri/url parameter ignored.
        if (args.length < 3 || args[2].trim().isEmpty()) {
            return getInputIRIFromParameters(request);
        }
        // Trim before the scheme check so stray leading whitespace cannot hide the scheme.
        String uri = args[2].trim();
        final String queryString = request.getQueryString();
        if (queryString != null) {
            uri = uri + "?" + queryString;
        }
        if (!hasScheme(uri)) {
            uri = "http://" + uri;
        } else if (hasOnlySingleSlashAfterScheme(uri)) {
            // This is to work around an issue where Tomcat 6.0.18 is
            // too smart for us. Tomcat normalizes double-slashes in
            // the path, and thus turns "http://" into "http:/" if it
            // occurs in the path. So we restore the double slash.
            uri = uri.replaceFirst(":/", "://");
        }
        return uri.trim();
    }

    /**
     * Reads the input IRI from the <code>uri</code> parameter, or from <code>url</code>
     * when <code>uri</code> is absent.
     *
     * @param request the incoming request.
     * @return the trimmed parameter value, or <code>null</code> if neither is present.
     */
    private String getInputIRIFromParameters(HttpServletRequest request) {
        String value = request.getParameter("uri");
        if (value == null) {
            value = request.getParameter("url");
        }
        return value == null ? null : value.trim();
    }

    /**
     * @param uri the string to inspect.
     * @return <code>true</code> if <code>uri</code> starts with an RFC 3986 scheme and colon.
     */
    private boolean hasScheme(String uri) {
        return schemeRegex.matcher(uri).find();
    }

    /**
     * @param uri the string to inspect.
     * @return <code>true</code> if the scheme is followed by exactly one slash.
     */
    private boolean hasOnlySingleSlashAfterScheme(String uri) {
        return schemeAndSingleSlashRegex.matcher(uri).find();
    }

    /**
     * Returns the media type of the request's <code>Content-Type</code> header with any
     * parameters (such as <code>;charset=UTF-8</code>) and surrounding whitespace removed.
     *
     * @param req the incoming request.
     * @return the bare media type, or <code>null</code> if the header is absent or blank.
     */
    private String getContentTypeHeader(HttpServletRequest req) {
        final String header = req.getHeader("Content-Type");
        if (header == null) {
            return null;
        }
        // strip off parameters such as ";charset=UTF-8"
        final int index = header.indexOf(';');
        final String mediaType = (index == -1 ? header : header.substring(0, index)).trim();
        return mediaType.isEmpty() ? null : mediaType;
    }

    /**
     * Creates the document source for <code>uri</code>, answering the request with a 400
     * error if the IRI is not acceptable.
     *
     * @param responder the responder used to obtain the HTTP client and to report errors.
     * @param uri the input IRI.
     * @param report whether error output should use report mode.
     * @return the document source, or <code>null</code> if an error response was sent.
     * @throws IOException if creating the source or sending the error fails.
     */
    private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
            throws IOException {
        try {
            if (!isValidIRI(uri)) {
                throw new URISyntaxException(uri, "Not a valid absolute http or https IRI");
            }
            return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
        } catch (URISyntaxException ex) {
            LOG.error("Invalid IRI detected", ex);
            responder.sendError(400, "Invalid input IRI " + uri, report);
            return null;
        }
    }

    /**
     * Factory hook for the HTTP document source; subclasses may override it.
     *
     * @param httpClient the client used to fetch the document.
     * @param uri the IRI of the document.
     * @return a document source for <code>uri</code>.
     * @throws IOException if the source cannot be created.
     * @throws URISyntaxException if <code>uri</code> is malformed.
     */
    protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
            throws IOException, URISyntaxException {
        return new HTTPDocumentSource(httpClient, uri);
    }

    /**
     * Checks that <code>s</code> parses as a URI whose scheme is <code>http</code> or
     * <code>https</code>.
     *
     * @param s the candidate IRI.
     * @return <code>true</code> if the IRI may be fetched by this servlet.
     */
    private boolean isValidIRI(String s) {
        try {
            final String scheme = new URI(s, false).getScheme();
            return "http".equals(scheme) || "https".equals(scheme);
        } catch (Exception e) {
            // Any failure while parsing means the input is not a usable IRI.
            return false;
        }
    }

    /**
     * Maps the <code>validation-mode</code> parameter to a {@link ValidationMode}.
     *
     * @param request the incoming request.
     * @return the selected mode; {@link ValidationMode#None} when the parameter is absent.
     * @throws IllegalArgumentException if the parameter has an unsupported value.
     */
    private ValidationMode getValidationMode(HttpServletRequest request) {
        final String PARAMETER = "validation-mode";
        final String validationMode = request.getParameter(PARAMETER);
        if (validationMode == null || "none".equalsIgnoreCase(validationMode)) {
            return ValidationMode.None;
        }
        if ("validate".equalsIgnoreCase(validationMode)) {
            return ValidationMode.Validate;
        }
        if ("validate-fix".equalsIgnoreCase(validationMode)) {
            return ValidationMode.ValidateAndFix;
        }
        throw new IllegalArgumentException(
                String.format("Invalid value '%s' for '%s' parameter.", validationMode, PARAMETER));
    }

    /**
     * Builds the extraction parameters for the request from the default configuration
     * and the requested validation mode.
     *
     * @param request the incoming request.
     * @return the extraction parameters.
     * @throws IllegalArgumentException if <code>validation-mode</code> is unsupported.
     */
    private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
        final ValidationMode mode = getValidationMode(request);
        return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
    }

    /**
     * Builds the extraction parameters, turning an unsupported <code>validation-mode</code>
     * into a 400 response instead of letting the exception escape the servlet.
     *
     * @param responder the responder used to report the error.
     * @param request the incoming request.
     * @param report whether error output should use report mode.
     * @return the extraction parameters, or <code>null</code> if an error response was sent.
     * @throws IOException if sending the error fails.
     */
    private ExtractionParameters getExtractionParametersOrSendError(WebResponder responder,
            HttpServletRequest request, boolean report) throws IOException {
        final ValidationMode mode;
        try {
            mode = getValidationMode(request);
        } catch (IllegalArgumentException ex) {
            responder.sendError(400, ex.getMessage(), report);
            return null;
        }
        return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
    }

    /**
     * @param request the incoming request.
     * @return <code>true</code> if the <code>report</code> parameter is present.
     */
    private boolean isReport(HttpServletRequest request) {
        return request.getParameter("report") != null;
    }

    /**
     * @param request the incoming request.
     * @return <code>true</code> if the <code>annotate</code> parameter is present.
     */
    private boolean isAnnotated(HttpServletRequest request) {
        return request.getParameter("annotate") != null;
    }

}