Java tutorial: Apache Tika as a JAX-RS HTTP service
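The class below exposes Apache Tika through two JAX-RS endpoints: a GET that parses a file already reachable by the server (its base location is resolved through a JNDI path entry), and a PUT that parses the uploaded request body. Both return JSON containing the extracted metadata, the extracted text, or both. Example requests appear as comments inside the class, and a small client sketch follows the code.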
/*
 * This file is licensed under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.nidhinova.tika.server;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.Set;

import javax.naming.InitialContext;
import javax.naming.NamingException;
import javax.ws.rs.Consumes;
import javax.ws.rs.GET;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.StreamingOutput;
import javax.ws.rs.core.UriInfo;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Tika as an HTTP service: returns metadata as JSON and textual content as
 * plain text. Can be used by doing a PUT of the file we want to parse, or by
 * a GET when the file is already available locally to the Tika server.
 *
 * @author github.com/gselva
 */
@Path("/")
@Component
@Scope("request")
public class TikaService {

    private final Log logger = LogFactory.getLog(TikaService.class);

    private static final String CONTENT_LENGTH = "Content-Length";
    private static final String FILE_NNAME = "File-Name";
    private static final String RESOURCE_NAME = "resourceName";
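
    // Example requests (hypothetical host, context path and JNDI key):
    //
    //   GET http://localhost:8080/tikaserver/metadata/docs/report.pdf
    //   GET http://localhost:8080/tikaserver/text/docs/report.pdf
    //   GET http://localhost:8080/tikaserver/fulldata/docs/report.pdf
    //
    //   PUT http://localhost:8080/tikaserver/text   (file bytes as the body;
    //       optional File-Name and Content-Length request headers)
    //
    // For GET, "docs" must be a JNDI entry under java:comp/env that resolves
    // to the base directory (or URL prefix) holding the files to parse.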
    /**
     * Serves HTTP GET. Returns metadata formatted as JSON, or the plain text
     * content of the file. The file must be locally accessible to the Tika
     * server at the location resolved from the pathkey JNDI entry.
     *
     * @param uriInfo
     * @param opkey
     *            the operation: "text", "metadata" or "fulldata"
     * @param pathkey
     *            JNDI lookup key for the base location of the file
     * @param resourceId
     *            path of the file relative to the pathkey location
     * @param httpHeaders
     * @return
     * @throws Exception
     */
    @GET
    @Produces({ MediaType.APPLICATION_JSON })
    @Path("/{opkey}/{pathkey}/{resourceid: .*}")
    public StreamingOutput getMetadata(@Context UriInfo uriInfo,
            @PathParam("opkey") final String opkey,
            @PathParam("pathkey") final String pathkey,
            @PathParam("resourceid") final String resourceId,
            @Context HttpHeaders httpHeaders) throws Exception {

        // get the resource segment, this may have query params;
        // we are ok with it as long as we can get something at that location
        String[] segments = uriInfo.getRequestUri().toASCIIString()
                .split("/" + opkey + "/" + pathkey + "/");
        final String filename = segments[segments.length - 1];
        logger.info("resource :" + filename);

        final Detector detector = createDetector(httpHeaders);
        final AutoDetectParser parser = new AutoDetectParser(detector);
        final ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
        setMetadataFromHeader(parser, metadata, httpHeaders);

        URL url = null;
        try {
            if (pathkey != null && resourceId != null) {
                String filepath = getFilePath(pathkey) + filename;
                File file = new File(filepath);
                if (file.isFile()) {
                    url = file.toURI().toURL();
                } else {
                    url = new URL(filepath);
                }
            }
        } catch (MalformedURLException mex) {
            throw new WebApplicationException(Response.Status.NOT_FOUND);
        }

        final InputStream is = TikaInputStream.get(url, metadata);

        return new StreamingOutput() {
            public void write(OutputStream outputStream) throws IOException,
                    WebApplicationException {
                StringWriter textBuffer = new StringWriter();
                ContentHandler handler = null;
                if (opkey.equalsIgnoreCase("metadata")) {
                    handler = new DefaultHandler();
                } else if (opkey.equalsIgnoreCase("text")
                        || opkey.equalsIgnoreCase("fulldata")) {
                    handler = new BodyContentHandler(textBuffer);
                }
                try {
                    parser.parse(is, handler, metadata, context);

                    String contentEncoding = (metadata
                            .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null
                                    ? "UTF-8"
                                    : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));
                    logger.info("Content encoding: "
                            + metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));
                    Writer outWriter = getOutputWriter(outputStream, contentEncoding);

                    // metadata is always gathered;
                    // munch the Tika metadata object to make JSON
                    String jsonMetadata = JSONHelper.metadataToJson(metadata);
                    if (opkey.equalsIgnoreCase("metadata")) {
                        outWriter.write("{\"metadata\":" + jsonMetadata + "}");
                    } else if (opkey.equalsIgnoreCase("text")) {
                        // write it out
                        outWriter.write("{ \"text\":"
                                + JSONHelper.toJSON(textBuffer.toString()) + " }");
                    } else if (opkey.equalsIgnoreCase("fulldata")) {
                        StringBuilder data = new StringBuilder();
                        data.append("{ \"metadata\":" + jsonMetadata).append(", ")
                                .append("\"text\":"
                                        + JSONHelper.toJSON(textBuffer.toString()) + " }");
                        outWriter.write(data.toString());
                    }
                    outWriter.flush();
                } catch (SAXException e) {
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                } catch (TikaException e) {
                    if (e.getCause() != null
                            && e.getCause() instanceof WebApplicationException) {
                        throw (WebApplicationException) e.getCause();
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof IllegalStateException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof EncryptedDocumentException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof OldWordFileFormatException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    logger.warn("Text extraction failed", e);
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                }
            }
        };
    }

    /**
     * Serves HTTP PUT. Returns metadata formatted as JSON, or the plain text
     * content of the uploaded file.
     *
     * @param is
     *            the request body (the file to parse)
     * @param opkey
     *            the operation: "text", "metadata" or "fulldata"
     * @param httpHeaders
     * @return
     * @throws Exception
     */
    @PUT
    @Consumes("*/*")
    @Produces({ MediaType.APPLICATION_JSON })
    @Path("/{opkey}")
    public StreamingOutput getMetadata(final InputStream is,
            @PathParam("opkey") final String opkey,
            @Context HttpHeaders httpHeaders) throws Exception {

        final Detector detector = createDetector(httpHeaders);
        final AutoDetectParser parser = new AutoDetectParser(detector);
        final ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        final org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
        setMetadataFromHeader(parser, metadata, httpHeaders);

        return new StreamingOutput() {
            public void write(OutputStream outputStream) throws IOException,
                    WebApplicationException {
                StringWriter textBuffer = new StringWriter();
                ContentHandler handler = null;
                if (opkey.equalsIgnoreCase("metadata")) {
                    handler = new DefaultHandler();
                } else if (opkey.equalsIgnoreCase("text")
                        || opkey.equalsIgnoreCase("fulldata")) {
                    handler = new BodyContentHandler(textBuffer);
                }
                try {
                    parser.parse(new BufferedInputStream(is), handler, metadata, context);

                    // use the detected Content-Encoding (not the Content-Type)
                    // as the charset of the response writer
                    String contentEncoding = (metadata
                            .get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING) == null
                                    ? "UTF-8"
                                    : metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING));
                    Writer outWriter = getOutputWriter(outputStream, contentEncoding);

                    // metadata is always gathered;
                    // munch the Tika metadata object to make JSON
                    String jsonMetadata = JSONHelper.metadataToJson(metadata);
                    if (opkey.equalsIgnoreCase("metadata")) {
                        outWriter.write("{\"metadata\":" + jsonMetadata + "}");
                    } else if (opkey.equalsIgnoreCase("text")) {
                        // write it out
                        outWriter.write("{ \"text\":"
                                + JSONHelper.toJSON(textBuffer.toString()) + " }");
                    } else if (opkey.equalsIgnoreCase("fulldata")) {
                        StringBuilder data = new StringBuilder();
                        data.append("{ \"metadata\":" + jsonMetadata).append(", ")
                                .append("\"text\":"
                                        + JSONHelper.toJSON(textBuffer.toString()) + " }");
                        outWriter.write(data.toString());
                    }
                    outWriter.flush();
                } catch (SAXException e) {
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                } catch (TikaException e) {
                    if (e.getCause() != null
                            && e.getCause() instanceof WebApplicationException) {
                        throw (WebApplicationException) e.getCause();
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof IllegalStateException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof EncryptedDocumentException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    if (e.getCause() != null
                            && e.getCause() instanceof OldWordFileFormatException) {
                        throw new WebApplicationException(Response.status(422).build());
                    }
                    logger.warn("Text extraction failed", e);
                    throw new WebApplicationException(
                            Response.Status.INTERNAL_SERVER_ERROR);
                }
            }
        };
    }

    /**
     * Creates an AutoDetectParser whose fallback parser rejects unsupported
     * media types with HTTP 415.
     *
     * @return
     */
    public static AutoDetectParser createParser() {
        final AutoDetectParser parser = new AutoDetectParser();
        parser.setFallback(new Parser() {
            public Set<org.apache.tika.mime.MediaType> getSupportedTypes(ParseContext parseContext) {
                return parser.getSupportedTypes(parseContext);
            }

            public void parse(InputStream inputStream, ContentHandler contentHandler,
                    org.apache.tika.metadata.Metadata metadata, ParseContext parseContext) {
                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
            }

            public void parse(InputStream inputStream, ContentHandler contentHandler,
                    org.apache.tika.metadata.Metadata metadata) {
                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
            }
        });
        return parser;
    }

    /**
     * Sets whatever metadata can be derived from the HTTP headers.
     *
     * @param parser
     * @param metadata
     * @param httpHeaders
     */
    public void setMetadataFromHeader(AutoDetectParser parser,
            org.apache.tika.metadata.Metadata metadata, HttpHeaders httpHeaders) {
        javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
        final List<String> fileName = httpHeaders.getRequestHeader(FILE_NNAME),
                cl = httpHeaders.getRequestHeader(CONTENT_LENGTH);

        if (cl != null && !cl.isEmpty())
            metadata.set(CONTENT_LENGTH, cl.get(0));
        if (fileName != null && !fileName.isEmpty())
            metadata.set(RESOURCE_NAME, fileName.get(0));

        if (mediaType != null
                && !mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
            metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, mediaType.toString());

            final Detector detector = parser.getDetector();
            parser.setDetector(new Detector() {
                public org.apache.tika.mime.MediaType detect(InputStream inputStream,
                        org.apache.tika.metadata.Metadata metadata) throws IOException {
                    String ct = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                    logger.info("Content type " + ct);
                    if (ct != null) {
                        return org.apache.tika.mime.MediaType.parse(ct);
                    } else {
                        return detector.detect(inputStream, metadata);
                    }
                }
            });
        }
    }
    public Detector createDetector(HttpHeaders httpHeaders) throws IOException, TikaException {
        final javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
        if (mediaType == null
                || mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE))
            return (new TikaConfig()).getMimeRepository();
        else
            return new Detector() {
                public org.apache.tika.mime.MediaType detect(InputStream inputStream,
                        org.apache.tika.metadata.Metadata metadata) throws IOException {
                    return org.apache.tika.mime.MediaType.parse(mediaType.toString());
                }
            };
    }

    /**
     * Returns an output writer with the given encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param output
     *            output stream
     * @param encoding
     *            output encoding, or <code>null</code> for the platform default
     * @return output writer
     * @throws UnsupportedEncodingException
     *             if the given encoding is not supported
     */
    private static Writer getOutputWriter(OutputStream output, String encoding)
            throws UnsupportedEncodingException {
        if (encoding != null) {
            return new OutputStreamWriter(output, encoding);
        } else if (System.getProperty("os.name").toLowerCase().startsWith("mac os x")) {
            // TIKA-324: Override the default encoding on Mac OS X
            return new OutputStreamWriter(output, "UTF-8");
        } else {
            return new OutputStreamWriter(output);
        }
    }

    /**
     * Returns the base path for pathkey from JNDI. Used by calls that process
     * server-accessible files where the absolute path should not be exposed in
     * the URL. Ensure the pathkey is available in JNDI.
     *
     * @return filepath
     */
    private String getFilePath(String pathkey) {
        logger.info("Getting path for " + pathkey);
        String path = "";
        try {
            javax.naming.Context initCtx = new InitialContext();
            path = (String) initCtx.lookup("java:comp/env/" + pathkey);
        } catch (NamingException e) {
            e.printStackTrace();
        }
        return path;
    }
}
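
The endpoints above can be exercised with any HTTP client. Below is a minimal client sketch (not part of the service itself) that PUTs a file to the "text" operation and prints the JSON response. The base URL http://localhost:8080/tikaserver is an assumption; substitute the host and context path where the service is actually deployed. For the GET endpoints, the pathkey segment must name a JNDI entry under java:comp/env (for example an env-entry declared in web.xml or in the container's context configuration) whose value is the base directory or URL prefix of the files to parse.

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class TikaServiceClient {

    public static void main(String[] args) throws Exception {
        // File to extract text from, passed on the command line
        Path file = Paths.get(args[0]);

        // PUT the raw file bytes to the "text" operation; with
        // application/octet-stream the service falls back to Tika's own
        // type detection rather than trusting the header.
        URL url = new URL("http://localhost:8080/tikaserver/text");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("PUT");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/octet-stream");
        conn.setRequestProperty("File-Name", file.getFileName().toString());

        try (OutputStream out = conn.getOutputStream()) {
            Files.copy(file, out);
        }

        // Read the JSON response, e.g. { "text": "..." }
        try (InputStream in = conn.getInputStream()) {
            ByteArrayOutputStream buf = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int n;
            while ((n = in.read(chunk)) != -1) {
                buf.write(chunk, 0, n);
            }
            System.out.println(buf.toString("UTF-8"));
        }
    }
}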