Java tutorial: Apache Tika as a JAX-RS HTTP service
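The class below exposes Apache Tika through two JAX-RS endpoints: a GET that parses a file already reachable by the server (its base location is resolved through a JNDI path entry), and a PUT that parses the uploaded request body. Both return JSON containing the extracted metadata, the extracted text, or both. Example requests appear as comments inside the class, and a small client sketch follows the code.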
/*
 * This file is licensed under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.nidhinova.tika.server;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.Set;

import javax.naming.InitialContext;
import javax.naming.NamingException;
import javax.ws.rs.Consumes;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.StreamingOutput;
import javax.ws.rs.core.UriInfo;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Tika as an HTTP service: returns metadata as JSON and textual content as
 * plain text. Can be used by doing a PUT of the file we want to parse, or by
 * a GET when the file is already available locally to the Tika server.
 *
 * @author github.com/gselva
 */
@Path("/")
@Component
@Scope("request")
public class TikaService {

    private final Log logger = LogFactory.getLog(TikaService.class);

    private static final String CONTENT_LENGTH = "Content-Length";
    private static final String FILE_NNAME = "File-Name";
    private static final String RESOURCE_NAME = "resourceName";
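
    // Example requests (hypothetical host, context path and JNDI key):
    //
    //   GET http://localhost:8080/tikaserver/metadata/docs/report.pdf
    //   GET http://localhost:8080/tikaserver/text/docs/report.pdf
    //   GET http://localhost:8080/tikaserver/fulldata/docs/report.pdf
    //
    //   PUT http://localhost:8080/tikaserver/text   (file bytes as the body;
    //       optional File-Name and Content-Length request headers)
    //
    // For GET, "docs" must be a JNDI entry under java:comp/env that resolves
    // to the base directory (or URL prefix) holding the files to parse.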
    /**
     * Serves HTTP GET. Returns metadata formatted as JSON, or the plain text
     * content of the file. The file must be locally accessible to the Tika
     * server at the location resolved from the pathkey JNDI entry.
     *
     * @param uriInfo
     * @param opkey
     *            the operation: "text", "metadata" or "fulldata"
     * @param pathkey
     *            JNDI lookup key for the base location of the file
     * @param resourceId
     *            path of the file relative to the pathkey location
     * @param httpHeaders
     * @return
     * @throws Exception
     */
    @GET
    @Produces({ MediaType.APPLICATION_JSON })
    @Path("/{opkey}/{pathkey}/{resourceid: .*}")
    public StreamingOutput getMetadata(@Context UriInfo uriInfo,
            @PathParam("opkey") final String opkey,
            @PathParam("pathkey") final String pathkey,
            @PathParam("resourceid") final String resourceId,
            @Context HttpHeaders httpHeaders) throws Exception {

        // get the resource segment, this may have query params;
        // we are ok with it as long as we can get something at that location
        String[] segments = uriInfo.getRequestUri().toASCIIString()
                .split("/" + opkey + "/" + pathkey + "/");
        final String filename = segments[segments.length - 1];
        logger.info("resource :" + filename);

        final Detector detector = createDetector(httpHeaders);
        final AutoDetectParser parser = new AutoDetectParser(detector);
        final ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
        setMetadataFromHeader(parser, metadata, httpHeaders);

        URL url = null;
        try {
            if (pathkey != null && resourceId != null) {
                String filepath = getFilePath(pathkey) + filename;
                File file = new File(filepath);
                if (file.isFile()) {
                    url = file.toURI().toURL();
                } else {
                    url = new URL(filepath);
                }
            }
        } catch (MalformedURLException mex) {
            throw new WebApplicationException(Response.Status.NOT_FOUND);
        }

        final InputStream is = TikaInputStream.get(url, metadata);

        return new StreamingOutput() {
            public void write(OutputStream outputStream) throws IOException,
                    WebApplicationException {
                StringWriter textBuffer = new StringWriter();
                ContentHandler handler = null;
                if (opkey.equalsIgnoreCase("metadata")) {
                    handler = new DefaultHandler();
                } else if (opkey.equalsIgnoreCase("text")
                        || opkey.equalsIgnoreCase("fulldata")) {
                    handler = new BodyContentHandler(textBuffer);
                }
                try {
                    parser.parse(is, handler, metadata, context);

                    String contentEncoding = (metadata
                            .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null
                                    ? "UTF-8"
                                    : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));
                    logger.info("Content encoding: "
                            + metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));
                    Writer outWriter = getOutputWriter(outputStream, contentEncoding);

                    // metadata is always gathered;
                    // munch the Tika metadata object to make JSON
                    String jsonMetadata = JSONHelper.metadataToJson(metadata);
                    if (opkey.equalsIgnoreCase("metadata")) {
                        outWriter.write("{\"metadata\":" + jsonMetadata + "}");
                    } else if (opkey.equalsIgnoreCase("text")) {
                        // write it out
                        outWriter.write("{ \"text\":"
                                + JSONHelper.toJSON(textBuffer.toString()) + " }");
                    } else if (opkey.equalsIgnoreCase("fulldata")) {
                        StringBuilder data = new StringBuilder();
                        data.append("{ \"metadata\":" + jsonMetadata).append(", ")
                                .append("\"text\":"
                                        + JSONHelper.toJSON(textBuffer.toString()) + " }");
                        outWriter.write(data.toString());
                    }
                    outWriter.flush();
                } catch (SAXException e) {
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                } catch (TikaException e) {
                    if (e.getCause() != null
                            && e.getCause() instanceof WebApplicationException) {
                        throw (WebApplicationException) e.getCause();
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof IllegalStateException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof EncryptedDocumentException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof OldWordFileFormatException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    logger.warn("Text extraction failed", e);
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                }
            }
        };
    }

    /**
     * Serves HTTP PUT. Returns metadata formatted as JSON, or the plain text
     * content of the uploaded file.
     *
     * @param is
     *            the request body (the file to parse)
     * @param opkey
     *            the operation: "text", "metadata" or "fulldata"
     * @param httpHeaders
     * @return
     * @throws Exception
     */
    @PUT
    @Consumes("*/*")
    @Produces({ MediaType.APPLICATION_JSON })
    @Path("/{opkey}")
    public StreamingOutput getMetadata(final InputStream is,
            @PathParam("opkey") final String opkey,
            @Context HttpHeaders httpHeaders) throws Exception {

        final Detector detector = createDetector(httpHeaders);
        final AutoDetectParser parser = new AutoDetectParser(detector);
        final ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
        setMetadataFromHeader(parser, metadata, httpHeaders);

        return new StreamingOutput() {
            public void write(OutputStream outputStream) throws IOException,
                    WebApplicationException {
                StringWriter textBuffer = new StringWriter();
                ContentHandler handler = null;
                if (opkey.equalsIgnoreCase("metadata")) {
                    handler = new DefaultHandler();
                } else if (opkey.equalsIgnoreCase("text")
                        || opkey.equalsIgnoreCase("fulldata")) {
                    handler = new BodyContentHandler(textBuffer);
                }
                try {
                    parser.parse(new BufferedInputStream(is), handler, metadata, context);

                    // use the detected Content-Encoding (not the Content-Type)
                    // as the charset of the response writer
                    String contentEncoding = (metadata
                            .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null
                                    ? "UTF-8"
                                    : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));
                    Writer outWriter = getOutputWriter(outputStream, contentEncoding);

                    // metadata is always gathered;
                    // munch the Tika metadata object to make JSON
                    String jsonMetadata = JSONHelper.metadataToJson(metadata);
                    if (opkey.equalsIgnoreCase("metadata")) {
                        outWriter.write("{\"metadata\":" + jsonMetadata + "}");
                    } else if (opkey.equalsIgnoreCase("text")) {
                        // write it out
                        outWriter.write("{ \"text\":"
                                + JSONHelper.toJSON(textBuffer.toString()) + " }");
                    } else if (opkey.equalsIgnoreCase("fulldata")) {
                        StringBuilder data = new StringBuilder();
                        data.append("{ \"metadata\":" + jsonMetadata).append(", ")
                                .append("\"text\":"
                                        + JSONHelper.toJSON(textBuffer.toString()) + " }");
                        outWriter.write(data.toString());
                    }
                    outWriter.flush();
                } catch (SAXException e) {
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                } catch (TikaException e) {
                    if (e.getCause() != null
                            && e.getCause() instanceof WebApplicationException) {
                        throw (WebApplicationException) e.getCause();
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof IllegalStateException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof EncryptedDocumentException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof OldWordFileFormatException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    logger.warn("Text extraction failed", e);
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                }
            }
        };
    }

    /**
     * Creates an AutoDetectParser whose fallback parser rejects unsupported
     * media types with HTTP 415.
     *
     * @return
     */
    public static AutoDetectParser createParser() {
        final AutoDetectParser parser = new AutoDetectParser();
        parser.setFallback(new Parser() {
            public Set<org.apache.tika.mime.MediaType> getSupportedTypes(ParseContext parseContext) {
                return parser.getSupportedTypes(parseContext);
            }

            public void parse(InputStream inputStream, ContentHandler contentHandler,
                    org.apache.tika.metadata.Metadata metadata, ParseContext parseContext) {
                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
            }

            public void parse(InputStream inputStream, ContentHandler contentHandler,
                    org.apache.tika.metadata.Metadata metadata) {
                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
            }
        });
        return parser;
    }

    /**
     * Sets whatever metadata can be derived from the HTTP headers.
     *
     * @param parser
     * @param metadata
     * @param httpHeaders
     */
    public void setMetadataFromHeader(AutoDetectParser parser,
            org.apache.tika.metadata.Metadata metadata, HttpHeaders httpHeaders) {
        javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
        final List<String> fileName = httpHeaders.getRequestHeader(FILE_NNAME),
                cl = httpHeaders.getRequestHeader(CONTENT_LENGTH);

        if (cl != null && !cl.isEmpty())
            metadata.set(CONTENT_LENGTH, cl.get(0));
        if (fileName != null && !fileName.isEmpty())
            metadata.set(RESOURCE_NAME, fileName.get(0));

        if (mediaType != null
                && !mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
            metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, mediaType.toString());

            final Detector detector = parser.getDetector();
            parser.setDetector(new Detector() {
                public org.apache.tika.mime.MediaType detect(InputStream inputStream,
                        org.apache.tika.metadata.Metadata metadata) throws IOException {
                    String ct = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                    logger.info("Content type " + ct);
                    if (ct != null) {
                        return org.apache.tika.mime.MediaType.parse(ct);
                    } else {
                        return detector.detect(inputStream, metadata);
                    }
                }
            });
        }
    }
    public Detector createDetector(HttpHeaders httpHeaders) throws IOException, TikaException {
        final javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
        if (mediaType == null
                || mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE))
            return (new TikaConfig()).getMimeRepository();
        else
            return new Detector() {
                public org.apache.tika.mime.MediaType detect(InputStream inputStream,
                        org.apache.tika.metadata.Metadata metadata) throws IOException {
                    return org.apache.tika.mime.MediaType.parse(mediaType.toString());
                }
            };
    }

    /**
     * Returns an output writer with the given encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param output
     *            output stream
     * @param encoding
     *            output encoding, or <code>null</code> for the platform default
     * @return output writer
     * @throws UnsupportedEncodingException
     *             if the given encoding is not supported
     */
    private static Writer getOutputWriter(OutputStream output, String encoding)
            throws UnsupportedEncodingException {
        if (encoding != null) {
            return new OutputStreamWriter(output, encoding);
        } else if (System.getProperty("os.name").toLowerCase().startsWith("mac os x")) {
            // TIKA-324: Override the default encoding on Mac OS X
            return new OutputStreamWriter(output, "UTF-8");
        } else {
            return new OutputStreamWriter(output);
        }
    }

    /**
     * Returns the base path for pathkey from JNDI. Used by calls that process
     * server-accessible files where the absolute path should not be exposed in
     * the URL. Ensure the pathkey is available in JNDI.
     *
     * @return filepath
     */
    private String getFilePath(String pathkey) {
        logger.info("Getting path for " + pathkey);
        String path = "";
        try {
            javax.naming.Context initCtx = new InitialContext();
            path = (String) initCtx.lookup("java:comp/env/" + pathkey);
        } catch (NamingException e) {
            e.printStackTrace();
        }
        return path;
    }
}
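
The endpoints above can be exercised with any HTTP client. Below is a minimal client sketch (not part of the service itself) that PUTs a file to the "text" operation and prints the JSON response. The base URL http://localhost:8080/tikaserver is an assumption; substitute the host and context path where the service is actually deployed. For the GET endpoints, the pathkey segment must name a JNDI entry under java:comp/env (for example an env-entry declared in web.xml or in the container's context configuration) whose value is the base directory or URL prefix of the files to parse.

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class TikaServiceClient {

    public static void main(String[] args) throws Exception {
        // File to extract text from, passed on the command line
        Path file = Paths.get(args[0]);

        // PUT the raw file bytes to the "text" operation; with
        // application/octet-stream the service falls back to Tika's own
        // type detection rather than trusting the header.
        URL url = new URL("http://localhost:8080/tikaserver/text");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("PUT");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/octet-stream");
        conn.setRequestProperty("File-Name", file.getFileName().toString());

        try (OutputStream out = conn.getOutputStream()) {
            Files.copy(file, out);
        }

        // Read the JSON response, e.g. { "text": "..." }
        try (InputStream in = conn.getInputStream()) {
            ByteArrayOutputStream buf = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int n;
            while ((n = in.read(chunk)) != -1) {
                buf.write(chunk, 0, n);
            }
            System.out.println(buf.toString("UTF-8"));
        }
    }
}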