rapture.dp.invocable.workflow.ProcessFile.java Source code

Java tutorial

Introduction

Here is the source code for rapture.dp.invocable.workflow.ProcessFile.java

Source

/**
 * The MIT License (MIT)
 *
 * Copyright (c) 2011-2016 Incapture Technologies LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package rapture.dp.invocable.workflow;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import com.google.common.collect.ImmutableMap;

import rapture.common.CallingContext;
import rapture.common.RaptureFolderInfo;
import rapture.common.exception.RaptureException;
import rapture.common.impl.jackson.JacksonUtil;
import rapture.common.util.InsertData;
import rapture.dp.AbstractStep;
import rapture.kernel.Kernel;

public class ProcessFile extends AbstractStep {

    public ProcessFile(String workerUri, String stepName) {
        super(workerUri, stepName);
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public String invoke(CallingContext ctx) {
        final int BATCH_LOAD_SIZE = 50; // TODO: move to config
        OPCPackage pkg;
        XSSFWorkbook wb;
        List uris = new ArrayList<>();
        // stores all documents for insertion
        List<List<String>> allDocs = new ArrayList<List<String>>();

        String file = Kernel.getDecision().getContextValue(ctx, getWorkerURI(), "filetoupload");
        String blobUri = Kernel.getDecision().getContextValue(ctx, getWorkerURI(), "blobUri");
        String folderName = Kernel.getDecision().getContextValue(ctx, getWorkerURI(), "folderName");

        String repo = "document://data/" + folderName;
        String docUri = repo + "#id";

        try {
            InputStream is = new ByteArrayInputStream(Kernel.getBlob().getBlob(ctx, blobUri).getContent());
            pkg = OPCPackage.open(is);
            wb = new XSSFWorkbook(pkg);
            XSSFSheet sheet = wb.getSheetAt(0);

            log.info("Loading " + sheet.getPhysicalNumberOfRows() + " rows from " + file + ". Batch size is "
                    + BATCH_LOAD_SIZE);

            int physicalNumberOfRows = sheet.getPhysicalNumberOfRows();
            int remainder = physicalNumberOfRows % BATCH_LOAD_SIZE;
            int div = physicalNumberOfRows / BATCH_LOAD_SIZE;

            // this only needs to be done once as the uris dont change
            for (int g = 1; g <= BATCH_LOAD_SIZE; g++) {
                uris.add(docUri);
            }
            log.info("created uris list " + uris.size());

            int j = 0;
            int count = 0;
            long startLoadTime = System.currentTimeMillis();
            for (int i = 1; i <= div; i++) {
                List docs = new ArrayList<>();
                // Create a list of documents with size of BATCH_LOAD_SIZE
                for (j = count; j < (BATCH_LOAD_SIZE * i); j++) {
                    Row row = sheet.getRow(j);
                    Map<String, Object> map = ImmutableMap.of("Row", row.getRowNum(), "DataPeriod",
                            row.getCell(0).toString(), "Industry", row.getCell(3).toString(), "Price",
                            row.getCell(7).toString());
                    docs.add(JacksonUtil.jsonFromObject(map));
                }
                allDocs.add(docs);
                count = j;
            }
            long endLoadTime = System.currentTimeMillis();

            ExecutorService executorService = Executors.newCachedThreadPool();
            long startWriteTime = System.currentTimeMillis();
            for (List<String> docList : allDocs) {
                executorService.execute(new InsertData(ctx, docList, uris));
            }
            executorService.shutdown();

            try {
                // TODO: hardcoded timeout.ComparableFutures?
                // Helpful:
                // http://stackoverflow.com/questions/1250643/how-to-wait-for-all-threads-to-finish-using-executorservice
                executorService.awaitTermination(60000L, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                log.error(e.getStackTrace().toString(), e);
                return "error";
            }
            long endWriteTime = System.currentTimeMillis();
            log.info("Completed parallel load.");

            // handle the remaining rows
            if (remainder > 0) {
                long remStartTime = System.currentTimeMillis();
                for (int k = (count); k < (count + remainder); k++) {
                    Row row = sheet.getRow(k);
                    Map<String, Object> map = ImmutableMap.of("Row", row.getRowNum(), "DataPeriod",
                            row.getCell(0).toString(), "Industry", row.getCell(3).toString(), "Price",
                            row.getCell(7).toString());
                    Kernel.getDoc().putDoc(ctx, docUri, JacksonUtil.jsonFromObject(map));
                }
                long remEndTime = System.currentTimeMillis();
                log.info("Remainders took " + (remEndTime - remStartTime) + "ms");
            }

            log.info("Populated uri " + repo + ". Took " + (endLoadTime - startLoadTime) + "ms. to load data. Took "
                    + (endWriteTime - startWriteTime) + "ms. to write data.");
            pkg.close();

            Map<String, RaptureFolderInfo> listDocsByUriPrefix = Kernel.getDoc().listDocsByUriPrefix(ctx, repo, 1);
            log.info("Count from repo is " + listDocsByUriPrefix.size());

            if (listDocsByUriPrefix.size() == sheet.getPhysicalNumberOfRows()) {
                return "ok";
            } else {
                return "error"; // TODO: add error step
            }
        } catch (InvalidFormatException | IOException | RaptureException e) {
            log.error("ProcessFile error", e);
            return "error";
        }
    }
}