Java tutorial
/* * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. ElasticSearch licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.index.mapper.attachment; import static org.elasticsearch.index.mapper.MapperBuilders.dateField; import static org.elasticsearch.index.mapper.MapperBuilders.stringField; import static org.elasticsearch.index.mapper.MapperBuilders.longField; import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType; import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.lang.reflect.Field; import java.security.NoSuchAlgorithmException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import org.apache.commons.io.output.TeeOutputStream; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.LongField; import org.apache.tika.exception.TikaException; import org.apache.tika.io.IOUtils; import org.apache.tika.metadata.Metadata; import org.elasticsearch.common.io.stream.BytesStreamInput; import org.elasticsearch.common.jackson.core.JsonParser; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.ESLoggerFactory; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.json.JsonXContentParser; import org.elasticsearch.index.mapper.ContentPath; import org.elasticsearch.index.mapper.FieldMapperListener; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MergeContext; import org.elasticsearch.index.mapper.MergeMappingException; import org.elasticsearch.index.mapper.ObjectMapperListener; import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.index.mapper.attachment.AttachmentMapper.CalcualteChecksumResult; import org.elasticsearch.index.mapper.attachment.AttachmentMapper.ParseResult; import org.elasticsearch.index.mapper.core.DateFieldMapper; import org.elasticsearch.index.mapper.core.LongFieldMapper; import org.elasticsearch.index.mapper.core.StringFieldMapper; import ucar.unidata.util.StringUtil; import vincent.FileMeta; import vincent.SHA1Calculator; /** * <pre> * field1 : "..." * </pre> * <p> * Or: * * <pre> * { * file1 : { * _content_type : "application/pdf", * _content_length : "500000000", * _name : "..../something.pdf", * content : "" * } * } * </pre> * <p/> * _content_length = Specify the maximum amount of characters to extract from * the attachment. If not specified, then the default for tika is 100,000 * characters. Caution is required when setting large values as this can cause * memory issues. */ public class AttachmentMapper implements Mapper { //not working in test cases final static ESLogger logger = ESLoggerFactory.getLogger("vincent-attachment"); final static boolean DEFAULT_USE_SYNC_HASHING = true; public static final String CONTENT_TYPE = "attachment"; public static class Defaults { public static final ContentPath.Type PATH_TYPE = ContentPath.Type.FULL; } public static class Builder extends Mapper.Builder<Builder, AttachmentMapper> { private ContentPath.Type pathType = Defaults.PATH_TYPE; // default builder private Integer defaultIndexedChars = null; private StringFieldMapper.Builder contentBuilder; private StringFieldMapper.Builder titleBuilder = stringField("title"); private StringFieldMapper.Builder nameBuilder = stringField("name"); private StringFieldMapper.Builder authorBuilder = stringField("author"); private StringFieldMapper.Builder keywordsBuilder = stringField("keywords"); private DateFieldMapper.Builder dateBuilder = dateField("date"); private StringFieldMapper.Builder contentTypeBuilder = stringField("content_type"); public Builder(String name) { super(name); this.builder = this; this.contentBuilder = stringField(name); } public Builder pathType(ContentPath.Type pathType) { this.pathType = pathType; return this; } // override if specified in mapping public Builder defaultIndexedChars(int defaultIndexedChars) { this.defaultIndexedChars = defaultIndexedChars; return this; } public Builder content(StringFieldMapper.Builder content) { this.contentBuilder = content; return this; } public Builder date(DateFieldMapper.Builder date) { this.dateBuilder = date; return this; } public Builder author(StringFieldMapper.Builder author) { this.authorBuilder = author; return this; } public Builder title(StringFieldMapper.Builder title) { this.titleBuilder = title; return this; } public Builder name(StringFieldMapper.Builder name) { this.nameBuilder = name; return this; } public Builder keywords(StringFieldMapper.Builder keywords) { this.keywordsBuilder = keywords; return this; } public Builder contentType(StringFieldMapper.Builder contentType) { this.contentTypeBuilder = contentType; return this; } @Override public AttachmentMapper build(BuilderContext context) { ContentPath.Type origPathType = context.path().pathType(); context.path().pathType(pathType); // create the content mapper under the actual name StringFieldMapper contentMapper = contentBuilder.build(context); // create the DC one under the name context.path().add(name); DateFieldMapper dateMapper = dateBuilder.ignoreMalformed(true).build(context); StringFieldMapper authorMapper = authorBuilder.build(context); StringFieldMapper titleMapper = titleBuilder.build(context); StringFieldMapper nameMapper = nameBuilder.store(true).includeInAll(true).build(context); StringFieldMapper keywordsMapper = keywordsBuilder.store(true).includeInAll(true).build(context); StringFieldMapper contentTypeMapper = contentTypeBuilder.build(context); // TODO encapsulate inside the mapper, pass context. pass to builder // instead ImageExifTikaMetaMapper imageExifTikaMetaMapper = new ImageExifTikaMetaMapper.Builder("image_exif") .build(context); // issue: under file.model, not image_exif.model StringFieldMapper checksumMapper = stringField("checksum").store(true).build(context); // .ignoreMalformed(true)/ LongFieldMapper checksumTookMapper = longField("checksumTook").store(true).build(context); LongFieldMapper parseTookMapper = longField("parseTook").store(true).build(context); FileMetaMapper fileMetaMapper = new FileMetaMapper("file_meta_post_parse", checksumMapper, checksumTookMapper, parseTookMapper); context.path().remove(); context.path().pathType(origPathType); int DEFAULT_INDEXED_CHARS = 5000000; if (defaultIndexedChars != null && context.indexSettings() != null) { defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", DEFAULT_INDEXED_CHARS); } if (defaultIndexedChars == null) { defaultIndexedChars = DEFAULT_INDEXED_CHARS; } return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, imageExifTikaMetaMapper, fileMetaMapper); } } /** * <pre> * field1 : { type : "attachment" } * </pre> * * Or: * * <pre> * field1 : { * type : "attachment", * fields : { * field1 : {type : "binary"}, * title : {store : "yes"}, * date : {store : "yes"} * } * } * </pre> */ public static class TypeParser implements Mapper.TypeParser { @SuppressWarnings({ "unchecked" }) @Override public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException { AttachmentMapper.Builder builder = new AttachmentMapper.Builder(name); System.out.println("Type Parser"); for (Map.Entry<String, Object> entry : node.entrySet()) { String fieldName = entry.getKey(); Object fieldNode = entry.getValue(); if (fieldName.equals("path")) { builder.pathType(parsePathType(name, fieldNode.toString())); } else if (fieldName.equals("fields")) { Map<String, Object> fieldsNode = (Map<String, Object>) fieldNode; for (Map.Entry<String, Object> entry1 : fieldsNode.entrySet()) { String propName = entry1.getKey(); Object propNode = entry1.getValue(); if (name.equals(propName)) {// name is properties type // name that with type // attachment i.e. field1 // that is the content builder.content((StringFieldMapper.Builder) parserContext.typeParser("string") .parse(name, (Map<String, Object>) propNode, parserContext)); } else if ("date".equals(propName)) { builder.date((DateFieldMapper.Builder) parserContext.typeParser("date").parse("date", (Map<String, Object>) propNode, parserContext)); } else if ("title".equals(propName)) { builder.title((StringFieldMapper.Builder) parserContext.typeParser("string") .parse("title", (Map<String, Object>) propNode, parserContext)); } else if ("name".equals(propName)) { builder.name((StringFieldMapper.Builder) parserContext.typeParser("string") .parse("name", (Map<String, Object>) propNode, parserContext)); } else if ("author".equals(propName)) { builder.author((StringFieldMapper.Builder) parserContext.typeParser("string") .parse("author", (Map<String, Object>) propNode, parserContext)); } else if ("keywords".equals(propName)) { builder.keywords((StringFieldMapper.Builder) parserContext.typeParser("string") .parse("keywords", (Map<String, Object>) propNode, parserContext)); } else if ("content_type".equals(propName)) { builder.contentType((StringFieldMapper.Builder) parserContext.typeParser("string") .parse("content_type", (Map<String, Object>) propNode, parserContext)); } } } } return builder; } } private final String name; private final ContentPath.Type pathType; private final int defaultIndexedChars; private final StringFieldMapper contentMapper; private final DateFieldMapper dateMapper; private final StringFieldMapper authorMapper; private final StringFieldMapper titleMapper; private final StringFieldMapper nameMapper; private final StringFieldMapper keywordsMapper; private final StringFieldMapper contentTypeMapper; // private final StringFieldMapper Mapper; // added mapper private final ImageExifTikaMetaMapper imageExifTikaMetaMapper; private final FileMetaMapper fileMetaMapper; public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, StringFieldMapper contentMapper, DateFieldMapper dateMapper, StringFieldMapper titleMapper, StringFieldMapper nameMapper, StringFieldMapper authorMapper, StringFieldMapper keywordsMapper, StringFieldMapper contentTypeMapper, ImageExifTikaMetaMapper imageExifTikaMetaMapper, FileMetaMapper fileMetaMapper) { this.name = name; this.pathType = pathType; this.defaultIndexedChars = defaultIndexedChars; this.contentMapper = contentMapper; this.dateMapper = dateMapper; this.titleMapper = titleMapper; this.nameMapper = nameMapper; this.authorMapper = authorMapper; this.keywordsMapper = keywordsMapper; this.contentTypeMapper = contentTypeMapper; // added vincent this.imageExifTikaMetaMapper = imageExifTikaMetaMapper; this.fileMetaMapper = fileMetaMapper; this.calculateChecksum = DEFAULT_USE_SYNC_HASHING; } private final boolean calculateChecksum; @Override public String name() { return name; } public boolean isString(XContentParser.Token token) { return token == XContentParser.Token.VALUE_STRING; } @Override public void parse(ParseContext context) throws IOException { System.out.println("Parse Index Request"); // byte[] content = null; String contentType = null; int indexedChars = defaultIndexedChars; String name = null; Map fieldMapping = new HashMap<String, Object>(); XContentParser parser = context.parser(); //create reference //TODO Map<String, Object> parseAndChecksumResults = null; XContentParser.Token token = parser.currentToken(); try { if (token == XContentParser.Token.VALUE_STRING || token == XContentParser.Token.VALUE_EMBEDDED_OBJECT) { parseAndChecksumResults = parseAndCalculateChecksumWithThreads(parser, indexedChars); } else { String currentFieldName = null; while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { currentFieldName = parser.currentName(); System.out.println(currentFieldName); } else { if ("content".equals(currentFieldName)) { //for both smile and string parseAndChecksumResults = parseAndCalculateChecksumWithThreads(parser, indexedChars); } else if (isString(token) && "_content_type".equals(currentFieldName)) { contentType = parser.text(); } else if (isString(token) && "_name".equals(currentFieldName)) { name = parser.text(); } else if (token == XContentParser.Token.VALUE_NUMBER && ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName))) { indexedChars = parser.intValue(); } else { logger.info("non-default mapping:" + currentFieldName); if ("content".equals(currentFieldName)) { } else { Object object = parser.objectText(); System.out.println(object); // content = parser.binaryValue(); fieldMapping.put(currentFieldName, object); } } } // Handle the mapping when doc come logger.info("fieldMapping" + fieldMapping); } } } catch (SecurityException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (NoSuchFieldException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (IllegalArgumentException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IllegalAccessException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ExecutionException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (TimeoutException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (parseAndChecksumResults == null) { throw new IOException("parse failed, result is null"); } CalcualteChecksumResult checksumResult = (CalcualteChecksumResult) parseAndChecksumResults .get("checksumResult"); ParseResult parseResult = (ParseResult) parseAndChecksumResults.get("parseResult"); Metadata metadata = parseResult.metadata; if (contentType != null) { metadata.add(Metadata.CONTENT_TYPE, contentType); } if (name != null) { metadata.add(Metadata.RESOURCE_NAME_KEY, name); } // used same interface for image / non-image as decouple detection // logic to detect done by tika on the fly. but check aftewards // logger.info("parsedContent" + parsedContent); if (isImage(metadata)) { System.out.println(metadata); } else { if (parseResult.parseContent.equalsIgnoreCase("")) { logger.info("Content is empty after parsed by Tika"); System.out.println(metadata); try { throw new TikaException("Content is empty after parsed by Tika"); } catch (TikaException e) { // TODO Auto-generated catch block throw new IOException(e); } } } context.externalValue(parseResult.parseContent); contentMapper.parse(context); context.externalValue(name); nameMapper.parse(context); context.externalValue(metadata.get(Metadata.DATE)); dateMapper.parse(context); context.externalValue(metadata.get(Metadata.TITLE)); titleMapper.parse(context); context.externalValue(metadata.get(Metadata.AUTHOR)); authorMapper.parse(context); context.externalValue("ImKeyWord"); keywordsMapper.parse(context); context.externalValue(metadata.get(Metadata.CONTENT_TYPE)); contentTypeMapper.parse(context); context.externalValue(metadata); imageExifTikaMetaMapper.parse(context); FileMeta fileMeta = new FileMeta(); fileMeta.setChecksum(checksumResult.checksum); fileMeta.setChecksumTook(checksumResult.took); fileMeta.setParseTook(parseResult.took); context.externalValue(fileMeta); fileMetaMapper.parse(context); } private Map<String, Object> parseAndCalculateChecksumWithThreads(XContentParser parser, int indexedChars) throws SecurityException, IllegalAccessException, NoSuchFieldException, IOException, InterruptedException, ExecutionException, TimeoutException { Map<String, Object> resultMap = new HashMap<String, Object>(); Metadata metadata = new Metadata(); JsonParser jsonParser = getInternalJsonParser(parser); PipedInputStream pipedIs = new PipedInputStream(); PipedOutputStream pipedOs = new PipedOutputStream(pipedIs); PipedInputStream pipedIs2 = new PipedInputStream(); PipedOutputStream pipedOs2 = new PipedOutputStream(pipedIs2); ExecutorService pool = Executors.newFixedThreadPool(2); Future future = pool.submit(new ParsingThread(pipedIs, metadata, indexedChars)); Future checksumFuture = null; if (calculateChecksum) { checksumFuture = pool.submit(new CalcualteChecksumThread(pipedIs2)); } TeeOutputStream tos = new TeeOutputStream(pipedOs, pipedOs2); int readBinaryValue = jsonParser.readBinaryValue(tos); // tee stream perhaps IOUtils.closeQuietly(tos); IOUtils.closeQuietly(pipedOs); IOUtils.closeQuietly(pipedOs2); System.out.println("main thread finish read" + readBinaryValue); ParseResult parseResult = (ParseResult) future.get(10 * 100, TimeUnit.SECONDS); CalcualteChecksumResult checksumResult = null; if (calculateChecksum && checksumFuture != null) { checksumResult = (CalcualteChecksumResult) checksumFuture.get(10 * 100, TimeUnit.SECONDS); System.out.println(checksumResult.checksum); } System.out.println("parseResult"); metadata = parseResult.metadata; // although metadata is reference, better return and use for easier // refactoring laters System.out.println(metadata); System.out.println("Thread join"); pool.shutdown(); pool.awaitTermination(10 * 100, TimeUnit.SECONDS); //TODO align static class and map resultMap.put("parseResult", parseResult); resultMap.put("checksumResult", checksumResult); return resultMap; } /** * @param parser * @param field * @return * @throws IllegalAccessException * @throws NoSuchFieldException * @throws SecurityException */ private JsonParser getInternalJsonParser(XContentParser parser) throws IllegalAccessException, SecurityException, NoSuchFieldException { Field field = JsonXContentParser.class.getDeclaredField("parser"); field.setAccessible(true); JsonParser jsonParser = (JsonParser) field.get(parser); return jsonParser; } /** * @param metadata */ public static boolean isImage(Metadata metadata) { String typeAfterDetection = metadata.get(Metadata.CONTENT_TYPE); return StringUtil.notEmpty(typeAfterDetection) && typeAfterDetection.startsWith("image"); } @Override public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException { // ignore this for now } @Override public void traverse(FieldMapperListener fieldMapperListener) { contentMapper.traverse(fieldMapperListener); dateMapper.traverse(fieldMapperListener); titleMapper.traverse(fieldMapperListener); nameMapper.traverse(fieldMapperListener); authorMapper.traverse(fieldMapperListener); keywordsMapper.traverse(fieldMapperListener); contentTypeMapper.traverse(fieldMapperListener); imageExifTikaMetaMapper.traverse(fieldMapperListener); fileMetaMapper.traverse(fieldMapperListener); } @Override public void traverse(ObjectMapperListener objectMapperListener) { } @Override public void close() { contentMapper.close(); dateMapper.close(); titleMapper.close(); nameMapper.close(); authorMapper.close(); keywordsMapper.close(); contentTypeMapper.close(); imageExifTikaMetaMapper.close(); fileMetaMapper.close(); } @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { // this is for mapping staage? builder.startObject(name); builder.field("type", CONTENT_TYPE); builder.field("path", pathType.name().toLowerCase()); builder.startObject("fields"); contentMapper.toXContent(builder, params); authorMapper.toXContent(builder, params); titleMapper.toXContent(builder, params); nameMapper.toXContent(builder, params); dateMapper.toXContent(builder, params); keywordsMapper.toXContent(builder, params); contentTypeMapper.toXContent(builder, params); // output is controlled here instead of builder? System.out.println("toXcontent for imageExif"); imageExifTikaMetaMapper.toXContent(builder, params); builder.endObject(); builder.endObject(); // the builder build the mapping // builder.string is stateful->will clear it and cause errror // logger.info(builder.string()); return builder; } public static class ParseResult { Metadata metadata; String parseContent; long took; ParseResult(Metadata metadata, String parseContent, long took) { this.metadata = metadata; this.parseContent = parseContent; this.took = took; } @Override public String toString() { return "ParseResult [metadata=" + metadata + ", parseContent=" + parseContent + ", took=" + took + "]"; } } public static class CalcualteChecksumResult { String checksum; long took; CalcualteChecksumResult(String checksum, long took) { this.checksum = checksum; this.took = took; } @Override public String toString() { return "CalcualteChecksumResult [checksum=" + checksum + ", took=" + took + "]"; } } public static class CalcualteChecksumThread implements Callable { InputStream is = null; CalcualteChecksumThread(InputStream is) { this.is = is; } @Override public Object call() throws Exception { System.out.println("Calculate Checksum"); long calculatedChecksumStart = System.currentTimeMillis(); long calculateChecksumTook = 0; String calculatedChecksum = ""; try { calculatedChecksum = SHA1Calculator.calculateChecksum(is); calculateChecksumTook = System.currentTimeMillis() - calculatedChecksumStart; System.out.println(calculateChecksumTook); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); } System.out.println("check sum done"); // content IOUtils.closeQuietly(is); return new CalcualteChecksumResult(calculatedChecksum, calculateChecksumTook); } } public static class ParsingThread implements Callable { InputStream is = null; Metadata metadata = null; int indexedChars = 0; ParsingThread(InputStream is, Metadata metadata, int indexedChars) { this.is = is; this.metadata = metadata; this.indexedChars = indexedChars; } // // @Override // public void run() { // byte[] bytes = new byte[1024]; // int read=0; // try { // while ((read = (is.read(bytes))) != -1) { // System.out.println("another read:" + read); // } // ; // } catch (IOException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // // } // // System.out.println("is close"); // // IOUtils.closeQuietly(is); // // } @Override public Object call() throws Exception { System.out.println("Start to parse"); // String parsedContent = tika() // .parseToString(is, metadata, indexedChars); long parseContentStart = System.currentTimeMillis(); String parsedContent = tika().parseToString(is, metadata, indexedChars); // System.out.println("parse completed"); IOUtils.closeQuietly(is); long took = System.currentTimeMillis() - parseContentStart; ParseResult parseResult = new ParseResult(metadata, parsedContent, took); // TODO Auto-generated method stub return parseResult; } } }