// FlockData integration — FileProcessor (Java source)
/* * * Copyright (c) 2012-2016 "FlockData LLC" * * This file is part of FlockData. * * FlockData is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * FlockData is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with FlockData. If not, see <http://www.gnu.org/licenses/>. */ package org.flockdata.integration; import au.com.bytecode.opencsv.CSVReader; import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.type.CollectionType; import com.fasterxml.jackson.databind.type.TypeFactory; import org.apache.commons.io.filefilter.WildcardFileFilter; import org.flockdata.helper.FdJsonObjectMapper; import org.flockdata.helper.FlockException; import org.flockdata.helper.NotFoundException; import org.flockdata.profile.model.ContentModel; import org.flockdata.profile.model.ExtractProfile; import org.flockdata.registration.TagInputBean; import org.flockdata.track.bean.EntityInputBean; import org.flockdata.track.bean.EntityToEntityLinkInput; import org.flockdata.transform.TransformationHelper; import org.flockdata.transform.Transformer; import org.flockdata.transform.xml.XmlMappable; import org.joda.time.DateTime; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Configuration; import org.springframework.core.io.ClassPathResource; import 
org.springframework.core.io.UrlResource; import org.springframework.expression.ExpressionParser; import org.springframework.expression.spel.standard.SpelExpressionParser; import org.springframework.expression.spel.support.StandardEvaluationContext; import org.springframework.util.StopWatch; import javax.xml.bind.JAXBException; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import javax.xml.transform.stream.StreamSource; import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; /** * @author mholdsworth * @since 7/10/2014 * @tag Integration, File, DelimitedFile, FdClient */ @Configuration public class FileProcessor { private static final org.slf4j.Logger logger = LoggerFactory.getLogger(FileProcessor.class); private static final DecimalFormat formatter = new DecimalFormat(); private static final ExpressionParser parser = new SpelExpressionParser(); static StandardEvaluationContext context = new StandardEvaluationContext(); private PayloadWriter payloadWriter; private long skipCount, rowsToProcess = 0; public FileProcessor() { } public FileProcessor(int skipCount, int rowsToProcess) { this.skipCount = skipCount; this.rowsToProcess = rowsToProcess; } @Autowired(required = false) public FileProcessor(PayloadWriter payloadWriter) { this.payloadWriter = payloadWriter; } private static String[] preProcess(String[] row, ExtractProfile extractProfile) { String[] result = new String[row.length]; String exp = extractProfile.getPreParseRowExp(); if ((exp == null || exp.equals(""))) return row; int i = 0; for (String column : row) { Object value = evaluateExpression(column, exp); result[i] = value.toString(); i++; } return result; } private static Object evaluateExpression(Object value, String expression) { context.setVariable("value", value); return 
parser.parseExpression(expression).getValue(context); } public static Reader getReader(String file) throws NotFoundException { InputStream stream = ClassLoader.class.getResourceAsStream(file); Reader fileObject = null; try { fileObject = new FileReader(file); } catch (FileNotFoundException e) { if (stream != null) fileObject = new InputStreamReader(stream); } if (fileObject == null) { logger.error("Unable to resolve the source [{}]", file); throw new NotFoundException("Unable to resolve the source " + file); } return fileObject; } public static boolean validateArgs(String pathToBatch) throws NotFoundException, IOException { Reader reader = getReader(pathToBatch); if (reader != null) reader.close(); return true; } public Collection<String> resolveFiles(String source) throws IOException, NotFoundException { ArrayList<String> results = new ArrayList<>(); boolean absoluteFile = true; if (source.contains("*") || source.contains("?") || source.endsWith(File.separator)) absoluteFile = false; if (absoluteFile) { Reader reader; reader = getReader(source); if (reader != null) { reader.close(); results.add(source); return results; } } String filter; String path; Path toResolve = Paths.get(source); if (source.endsWith(File.separator)) filter = "*"; else filter = toResolve.getFileName().toString(); if (filter == null) filter = "*"; path = source.substring(0, source.lastIndexOf(File.separator) + 1); FileFilter fileFilter = new WildcardFileFilter(filter); // Split the source in to path and filter. 
//path = source.substring(0, source.indexOf()) File folder = new UrlResource("file:" + path).getFile(); File[] listOfFiles = folder.listFiles(fileFilter); if (listOfFiles == null) { folder = new ClassPathResource(path).getFile(); listOfFiles = folder.listFiles(fileFilter); } for (File file : listOfFiles) { results.add(file.toString()); } return results; } public int processFile(ExtractProfile extractProfile, String source) throws IllegalAccessException, InstantiationException, IOException, FlockException, ClassNotFoundException { //String source = path; logger.info("Start processing of {}", source); Collection<String> files = resolveFiles(source); int result = 0; try { for (String file : files) { if (extractProfile.getContentType() == ExtractProfile.ContentType.CSV) result = processCSVFile(file, extractProfile); else if (extractProfile.getContentType() == ExtractProfile.ContentType.XML) result = processXMLFile(file, extractProfile); else if (extractProfile.getContentType() == ExtractProfile.ContentType.JSON) { if (extractProfile.getContentModel().getDocumentType() == null) result = processJsonTags(file); else result = processJsonEntities(file, extractProfile); } } } finally { if (result > 0) { getPayloadWriter().flush(); } } logger.info("Processed {}", source); return result; } private int processJsonTags(String fileName) throws FlockException { Collection<TagInputBean> tags; ObjectMapper mapper = FdJsonObjectMapper.getObjectMapper(); int processed = 0; try { File file = new File(fileName); InputStream stream = null; if (!file.exists()) { // Try as a resource stream = ClassLoader.class.getResourceAsStream(fileName); if (stream == null) { logger.error("{} does not exist", fileName); throw new FlockException(fileName + " Does not exist"); } } TypeFactory typeFactory = mapper.getTypeFactory(); CollectionType collType = typeFactory.constructCollectionType(ArrayList.class, TagInputBean.class); if (file.exists()) tags = mapper.readValue(file, collType); else tags = 
mapper.readValue(stream, collType); for (TagInputBean tag : tags) { getPayloadWriter().writeTag(tag, "JSON Tag Importer"); processed++; } } catch (IOException e) { logger.error("Error writing exceptions with {} [{}]", fileName, e.getMessage()); throw new RuntimeException("IO Exception ", e); } finally { if (processed > 0L) getPayloadWriter().flush(); } return tags.size(); } private int processJsonEntities(String fileName, ExtractProfile extractProfile) throws FlockException { int rows = 0; File file = new File(fileName); InputStream stream = null; if (!file.exists()) { stream = ClassLoader.class.getResourceAsStream(fileName); if (stream == null) { logger.error("{} does not exist", fileName); return 0; } } StopWatch watch = new StopWatch(); JsonFactory jfactory = new JsonFactory(); JsonParser jParser; List<EntityToEntityLinkInput> referenceInputBeans = new ArrayList<>(); try { //String docType = mappable.getDataType(); watch.start(); ObjectMapper om = FdJsonObjectMapper.getObjectMapper(); try { if (stream != null) jParser = jfactory.createParser(stream); else jParser = jfactory.createParser(file); JsonToken currentToken = jParser.nextToken(); long then = new DateTime().getMillis(); JsonNode node; if (currentToken == JsonToken.START_ARRAY || currentToken == JsonToken.START_OBJECT) { while (currentToken != null && currentToken != JsonToken.END_OBJECT) { while (currentToken != null && jParser.nextToken() != JsonToken.END_ARRAY) { node = om.readTree(jParser); if (node != null) { processJsonNode(node, extractProfile.getContentModel(), referenceInputBeans); if (stopProcessing(rows++, then)) { break; } } currentToken = jParser.nextToken(); } } } else if (currentToken == JsonToken.START_OBJECT) { //om.readTree(jParser); node = om.readTree(jParser); processJsonNode(node, extractProfile.getContentModel(), referenceInputBeans); } } catch (IOException e1) { logger.error("Unexpected", e1); } } finally { getPayloadWriter().flush(); } return endProcess(watch, rows, 0); } private 
void processJsonNode(JsonNode node, ContentModel importProfile, List<EntityToEntityLinkInput> referenceInputBeans) throws FlockException { EntityInputBean entityInputBean = Transformer.toEntity(node, importProfile); if (!entityInputBean.getEntityLinks().isEmpty()) { referenceInputBeans.add(new EntityToEntityLinkInput(entityInputBean)); entityInputBean.getEntityLinks().size(); } getPayloadWriter().writeEntity(entityInputBean); } private int processXMLFile(String file, ExtractProfile extractProfile) throws IOException, FlockException, IllegalAccessException, InstantiationException, ClassNotFoundException { try { int rows = 0; StopWatch watch = new StopWatch(); StreamSource source = new StreamSource(file); XMLInputFactory xif = XMLInputFactory.newFactory(); XMLStreamReader xsr = xif.createXMLStreamReader(source); XmlMappable mappable = (XmlMappable) Class.forName(extractProfile.getHandler()).newInstance(); mappable.positionReader(xsr); String dataType = mappable.getDataType(); watch.start(); try { long then = new DateTime().getMillis(); while (xsr.getLocalName().equals(dataType)) { EntityInputBean entityInputBean = Transformer.toEntity(mappable, xsr, extractProfile.getContentModel()); rows++; xsr.nextTag(); getPayloadWriter().writeEntity(entityInputBean); if (stopProcessing(rows, then)) { break; } } } finally { getPayloadWriter().flush(); } return endProcess(watch, rows, 0); } catch (XMLStreamException | JAXBException e1) { throw new IOException(e1); } } private int processCSVFile(String file, ExtractProfile extractProfile) throws IOException, IllegalAccessException, InstantiationException, FlockException, ClassNotFoundException { StopWatch watch = new StopWatch(); int ignoreCount = 0; int currentRow = 0; BufferedReader br; Reader fileObject = getReader(file); br = new BufferedReader(fileObject); try { CSVReader csvReader; if (extractProfile.getQuoteCharacter() != null) csvReader = new CSVReader(br, extractProfile.getDelimiter(), 
extractProfile.getQuoteCharacter().charAt(0)); else csvReader = new CSVReader(br, extractProfile.getDelimiter()); String[] headerRow = null; String[] nextLine; if (extractProfile.hasHeader()) { while ((nextLine = csvReader.readNext()) != null) { if (isHeaderRow(nextLine)) { headerRow = nextLine; break; } } } watch.start(); if (skipCount > 0) logger.info("Skipping first {} rows", skipCount); long then = System.currentTimeMillis(); while ((nextLine = csvReader.readNext()) != null) { if (!ignoreRow(nextLine)) { if (headerRow == null) { headerRow = TransformationHelper.defaultHeader(nextLine, extractProfile.getContentModel()); } currentRow++; if (currentRow >= skipCount) { if (currentRow == skipCount) logger.info("Processing now begins at row {}", skipCount); nextLine = preProcess(nextLine, extractProfile); Map<String, Object> map = Transformer.convertToMap(headerRow, nextLine, extractProfile); if (map != null) { if (extractProfile.getContentModel().isTagModel()) { Collection<TagInputBean> tagInputBean = Transformer.toTags(map, extractProfile.getContentModel()); if (tagInputBean != null) { getPayloadWriter().writeTags(tagInputBean, "TagInputBean"); } } else { EntityInputBean entityInputBean = Transformer.toEntity(map, extractProfile.getContentModel()); // Dispatch/load mechanism if (entityInputBean != null) getPayloadWriter().writeEntity(entityInputBean); } if (stopProcessing(currentRow, then)) { break; } } } } else { ignoreCount++; } } } finally { getPayloadWriter().flush(); br.close(); } return endProcess(watch, currentRow, ignoreCount); } private boolean isHeaderRow(String[] nextLine) { if (nextLine.length > 0) { // do we have data? if (nextLine[0].length() > 0) // is there a value in the first char? return !(nextLine[0].startsWith("#") || nextLine[0].charAt(1) == '#'); // is it a comment? 
} return true; } public boolean stopProcessing(long currentRow) { return stopProcessing(currentRow, 0L); } private boolean stopProcessing(long currentRow, long then) { //DAT-350 if (rowsToProcess == 0) { if (currentRow % 1000 == 0) logger.info("Processed {} elapsed seconds {}", currentRow - skipCount, (new DateTime().getMillis() - then) / 1000d); return false; } boolean stop = currentRow >= skipCount + rowsToProcess; if (!stop && currentRow != skipCount && then > 0 && currentRow % 1000 == 0) logger.info("Processed {} elapsed seconds {}", currentRow - skipCount, (new DateTime().getMillis() - then) / 1000d); if (currentRow <= skipCount) return false; if (stop) logger.info("Process stopping after the {} requested rows.", rowsToProcess); return stop; } private boolean ignoreRow(String[] nextLine) { return nextLine[0].startsWith("#"); } public int endProcess(StopWatch watch, int rows, int ignoreCount) { watch.stop(); double mins = watch.getTotalTimeSeconds() / 60; long rowsProcessed = rows - skipCount; if (skipCount > 0) logger.info( "Completed [{}] rows in [{}] secs. rpm [{}]. Skipped first [{}] rows, finished on row {}, ignored [{}] rows", rowsProcessed, formatter.format(watch.getTotalTimeSeconds()), formatter.format(rowsProcessed / mins), skipCount, rows, ignoreCount); else logger.info("Completed [{}] rows in [{}] secs. rpm [{}] Finished on row [{}], ignored [{}] rows.", rowsProcessed, formatter.format(watch.getTotalTimeSeconds()), formatter.format(rowsProcessed / mins), rows, ignoreCount); return rows; } private PayloadWriter getPayloadWriter() { if (payloadWriter == null) { logger.error( "You are trying to use the FileProcessor but no FdBatcher has been configured for this service"); throw new RuntimeException("Attempted use of the FileProcessor with no FdBatcher"); } return payloadWriter; } }