gov.nih.nci.evs.app.neopl.ExcelToCSV.java Source code

Java tutorial

Introduction

Here is the source code for gov.nih.nci.evs.app.neopl.ExcelToCSV.java

Source

package gov.nih.nci.evs.app.neopl;

import com.opencsv.CSVReader;
//import gov.nih.nci.evs.browser.utils.*;
import java.io.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.FormulaEvaluator;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;

/**
 * <!-- LICENSE_TEXT_START -->
 * Copyright 2008-2016 NGIS. This software was developed in conjunction
 * with the National Cancer Institute, and so to the extent government
 * employees are co-authors, any rights in such works shall be subject
 * to Title 17 of the United States Code, section 105.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the disclaimer of Article 3,
 *      below. Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *   2. The end-user documentation included with the redistribution,
 *      if any, must include the following acknowledgment:
 *      "This product includes software developed by NGIS and the National
 *      Cancer Institute."   If no such end-user documentation is to be
 *      included, this acknowledgment shall appear in the software itself,
 *      wherever such third-party acknowledgments normally appear.
 *   3. The names "The National Cancer Institute", "NCI" and "NGIS" must
 *      not be used to endorse or promote products derived from this software.
 *   4. This license does not authorize the incorporation of this software
 *      into any third party proprietary programs. This license does not
 *      authorize the recipient to use any trademarks owned by either NCI
 *      or NGIS
 *   5. THIS SOFTWARE IS PROVIDED "AS IS," AND ANY EXPRESSED OR IMPLIED
 *      WARRANTIES, (INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *      OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE) ARE
 *      DISCLAIMED. IN NO EVENT SHALL THE NATIONAL CANCER INSTITUTE,
 *      NGIS, OR THEIR AFFILIATES BE LIABLE FOR ANY DIRECT, INDIRECT,
 *      INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *      BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *      LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *      CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *      LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *      ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *      POSSIBILITY OF SUCH DAMAGE.
 * <!-- LICENSE_TEXT_END -->
 */

/**
 * @author EVS Team
 * @version 1.0
 *
 * Modification history:
 *     Initial implementation kim.ong@ngc.com
 *
 */

public class ExcelToCSV {
    private Workbook workbook = null;
    private ArrayList<ArrayList<String>> csvData = null;
    private int maxRowWidth = 0;
    private int formattingConvention = 0;
    private DataFormatter formatter = null;
    private FormulaEvaluator evaluator = null;
    private String separator = null;

    private static final String CSV_FILE_EXTENSION = ".csv";
    private static final String DEFAULT_SEPARATOR = ",";
    public static final int EXCEL_STYLE_ESCAPING = 0;

    public static final int UNIX_STYLE_ESCAPING = 1;

    private String header = null;

    public void setHeader(String header) {
        this.header = header;
    }

    public void convertExcelToCSV(String strSource, String strDestination)
            throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException {
        this.convertExcelToCSV(strSource, strDestination, DEFAULT_SEPARATOR, EXCEL_STYLE_ESCAPING);
    }

    public void convertExcelToCSV(String strSource, String strDestination, String separator)
            throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException {

        this.convertExcelToCSV(strSource, strDestination, separator, EXCEL_STYLE_ESCAPING);
    }

    public ArrayList readExcelFile(String strSource)
            throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException {
        return readExcelFile(strSource, DEFAULT_SEPARATOR, EXCEL_STYLE_ESCAPING);
    }

    public ArrayList readExcelFile(String strSource, String separator, int formattingConvention)
            throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException {

        File source = new File(strSource);
        File[] filesList = null;

        if (!source.exists()) {
            throw new IllegalArgumentException("The source for the Excel " + "file(s) cannot be found.");
        }
        if (formattingConvention != EXCEL_STYLE_ESCAPING && formattingConvention != UNIX_STYLE_ESCAPING) {
            throw new IllegalArgumentException(
                    "The value passed to the " + "formattingConvention parameter is out of range.");
        }
        this.separator = separator;
        this.formattingConvention = formattingConvention;

        File excelFile = source;
        this.openWorkbook(excelFile);
        this.convertToCSV();
        return this.csvData;
    }

    public void convertExcelToCSV(String strSource, String strDestination, String separator,
            int formattingConvention)
            throws FileNotFoundException, IOException, IllegalArgumentException, InvalidFormatException {

        File source = new File(strSource);
        File destination = new File(strDestination);
        File[] filesList = null;
        String destinationFilename = null;

        if (!source.exists()) {
            throw new IllegalArgumentException("The source for the Excel " + "file(s) cannot be found.");
        }
        if (formattingConvention != EXCEL_STYLE_ESCAPING && formattingConvention != UNIX_STYLE_ESCAPING) {
            throw new IllegalArgumentException(
                    "The value passed to the " + "formattingConvention parameter is out of range.");
        }
        this.separator = separator;
        this.formattingConvention = formattingConvention;

        File excelFile = source;
        this.openWorkbook(excelFile);
        this.convertToCSV();

        destinationFilename = excelFile.getName();
        int n = destinationFilename.lastIndexOf(".");
        destinationFilename = destinationFilename.substring(0, n) + CSV_FILE_EXTENSION;
        System.out.println("(*) destinationFilename: " + destinationFilename);
        this.saveCSVFile(new File(destinationFilename));
    }

    private void openWorkbook(File file) throws FileNotFoundException, IOException, InvalidFormatException {
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            this.workbook = WorkbookFactory.create(fis);
            this.evaluator = this.workbook.getCreationHelper().createFormulaEvaluator();
            this.formatter = new DataFormatter(true);
        } finally {
            if (fis != null) {
                fis.close();
            }
        }
    }

    private void convertToCSV() {
        Sheet sheet = null;
        Row row = null;
        int lastRowNum = 0;
        this.csvData = new ArrayList<ArrayList<String>>();

        System.out.println("Converting files contents to CSV format.");

        int numSheets = this.workbook.getNumberOfSheets();
        for (int i = 0; i < numSheets; i++) {
            sheet = this.workbook.getSheetAt(i);
            if (sheet.getPhysicalNumberOfRows() > 0) {
                lastRowNum = sheet.getLastRowNum();
                /*
                for(int j = 0; j <= lastRowNum; j++) {
                row = sheet.getRow(j);
                this.rowToCSV(row);
                }
                */
                for (int j = 1; j <= lastRowNum; j++) {
                    row = sheet.getRow(j);
                    this.rowToCSV(row);
                }
            }
            //sheet.createFreezePane(0,1); // this will freeze the header row
        }

    }

    private void saveCSVFile(File file) throws FileNotFoundException, IOException {
        FileWriter fw = null;
        BufferedWriter bw = null;
        ArrayList<String> line = null;
        StringBuffer buffer = null;
        String csvLineElement = null;
        try {
            fw = new FileWriter(file);
            bw = new BufferedWriter(fw);

            if (header != null) {
                bw.write(header);
                bw.newLine();
            }
            for (int i = 0; i < this.csvData.size(); i++) {
                buffer = new StringBuffer();
                line = this.csvData.get(i);
                for (int j = 0; j < this.maxRowWidth; j++) {
                    if (line.size() > j) {
                        csvLineElement = line.get(j);
                        if (csvLineElement != null) {
                            buffer.append(this.escapeEmbeddedCharacters(csvLineElement));
                        }
                    }
                    if (j < (this.maxRowWidth - 1)) {
                        buffer.append(this.separator);
                    }
                }
                bw.write(buffer.toString().trim());
                if (i < (this.csvData.size() - 1)) {
                    bw.newLine();
                }
            }
        } finally {
            if (bw != null) {
                bw.flush();
                bw.close();
            }
        }
    }

    private void rowToCSV(Row row) {
        Cell cell = null;
        int lastCellNum = 0;
        ArrayList<String> csvLine = new ArrayList<String>();
        if (row != null) {
            lastCellNum = row.getLastCellNum();
            for (int i = 0; i <= lastCellNum; i++) {
                cell = row.getCell(i);
                if (cell == null) {
                    csvLine.add("");
                } else {
                    csvLine.add(this.formatter.formatCellValue(cell));
                }
            }
            if (lastCellNum > this.maxRowWidth) {
                this.maxRowWidth = lastCellNum;
            }
        }
        this.csvData.add(csvLine);
    }

    private String escapeEmbeddedCharacters(String field) {
        StringBuffer buffer = null;
        if (this.formattingConvention == EXCEL_STYLE_ESCAPING) {
            if (field.contains("\"")) {
                buffer = new StringBuffer(field.replaceAll("\"", "\\\"\\\""));
                buffer.insert(0, "\"");
                buffer.append("\"");
            } else {
                buffer = new StringBuffer(field);
                if ((buffer.indexOf(this.separator)) > -1 || (buffer.indexOf("\n")) > -1) {
                    buffer.insert(0, "\"");
                    buffer.append("\"");
                }
            }
            return (buffer.toString().trim());
        } else {
            if (field.contains(this.separator)) {
                field = field.replaceAll(this.separator, ("\\\\" + this.separator));
            }
            if (field.contains("\n")) {
                field = field.replaceAll("\n", "\\\\\n");
            }
            return (field);
        }
    }

    class ExcelFilenameFilter implements FilenameFilter {
        public boolean accept(File file, String name) {
            return (name.endsWith(".xls") || name.endsWith(".xlsx"));
        }
    }

    public static String getOutputFile(String inputfile, String format) {
        int n = inputfile.lastIndexOf(".");
        String outputfile = inputfile.substring(0, n) + "." + format;
        return outputfile;
    }

    public static Vector<String> parseData(String line) {
        if (line == null)
            return null;
        String tab = "|";
        return parseData(line, tab);
    }

    public static Vector<String> parseData(String line, String tab) {
        if (line == null)
            return null;
        Vector data_vec = new Vector();
        StringTokenizer st = new StringTokenizer(line, tab);
        while (st.hasMoreTokens()) {
            String value = st.nextToken();
            if (value.compareTo("null") == 0)
                value = "";
            data_vec.add(value);
        }
        return data_vec;
    }

    public static Vector extractColumnDataFromCSV(String csvfile, int column_index) {
        boolean skip_heading = true;
        return extractColumnDataFromCSV(csvfile, column_index, skip_heading);
    }

    public static Vector extractColumnDataFromCSV(String csvfile, int column_index, boolean skip_heading) {
        Vector w = new Vector();
        String[] values = null;
        try {
            CSVReader reader = new CSVReader(new FileReader(csvfile));
            if (skip_heading) {
                values = reader.readNext();
            }
            while ((values = reader.readNext()) != null) {
                String value = (String) values[column_index];
                w.add(value);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
        return w;
    }

    public void toDelimitedFile(String excelfile, String outputfile, String heading) {
        PrintWriter pw = null;

        FileWriter fw = null;
        BufferedWriter bw = null;
        try {
            //fw = new FileWriter(new File(outputfile));
            bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8"));

            //bw = new BufferedWriter(fw);

            List list = readExcelFile(excelfile);
            if (heading != null && heading.length() > 0) {
                bw.write(heading);
                bw.newLine();
            }

            for (int i = 0; i < list.size(); i++) {
                ArrayList<String> a = (ArrayList<String>) list.get(i);
                StringBuffer buf = new StringBuffer();
                for (int j = 0; j < a.size(); j++) {
                    String t = (String) a.get(j);
                    buf.append(t).append("\t");
                }
                String line = buf.toString();
                bw.write(line);
                bw.newLine();

                //pw.println(line);
            }

        } catch (Exception ex) {
            ex.printStackTrace();

        } finally {
            try {
                //if (fin != null) fin.close();
                //if (fout != null) fout.close();
                //pw.close();
                bw.close();
                System.out.println("Output file " + outputfile + " generated.");
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }

    public static List readCSV(String csvfile, boolean skip_heading) {
        ArrayList list = new ArrayList();
        int n = csvfile.lastIndexOf(".");
        String nextLine = null;
        //String outputfile = getOutputFile(csvfile, "txt");

        Vector w = new Vector();
        String[] values = null;
        BufferedReader in = null;
        try {
            //CSVReader reader = new CSVReader(new FileReader(csvfile));
            in = new BufferedReader(new InputStreamReader(new FileInputStream(csvfile), "UTF-8"));

            CSVReader reader = new CSVReader(in);

            if (skip_heading) {
                values = reader.readNext();
            }
            while ((values = reader.readNext()) != null) {
                list.add(values);
            }

        } catch (Exception ex) {
            ex.printStackTrace();
            return null;

        } finally {
            try {
                if (in != null)
                    in.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        return list;
    }

    /*
        public static void main(String[] args) {
    ExcelToCSV converter = null;
    boolean converted = true;
    long startTime = System.currentTimeMillis();
    try {
        converter = new ExcelToCSV();
        if(args.length == 2) {
            converter.convertExcelToCSV(args[0], args[1]);
        }
        else if(args.length == 3){
            converter.convertExcelToCSV(args[0], args[1], args[2]);
        }
        else if(args.length == 4) {
            converter.convertExcelToCSV(args[0], args[1],
                                        args[2], Integer.parseInt(args[3]));
        }
        else {
            System.out.println("Usage: java ToCSV [Source File/Folder] " +
                "[Destination Folder] [Separator] [Formatting Convention]\n" +
                "\tSource File/Folder\tThis argument should contain the name of and\n" +
                "\t\t\t\tpath to either a single Excel workbook or a\n" +
                "\t\t\t\tfolder containing one or more Excel workbooks.\n" +
                "\tDestination Folder\tThe name of and path to the folder that the\n" +
                "\t\t\t\tCSV files should be written out into. The\n" +
                "\t\t\t\tfolder must exist before running the ToCSV\n" +
                "\t\t\t\tcode as it will not check for or create it.\n" +
                "\tSeparator\t\tOptional. The character or characters that\n" +
                "\t\t\t\tshould be used to separate fields in the CSV\n" +
                "\t\t\t\trecord. If no value is passed then the comma\n" +
                "\t\t\t\twill be assumed.\n" +
                "\tFormatting Convention\tOptional. This argument can take one of two\n" +
                "\t\t\t\tvalues. Passing 0 (zero) will result in a CSV\n" +
                "\t\t\t\tfile that obeys Excel's formatting conventions\n" +
                "\t\t\t\twhilst passing 1 (one) will result in a file\n" +
                "\t\t\t\tthat obeys UNIX formatting conventions. If no\n" +
                "\t\t\t\tvalue is passed, then the CSV file produced\n" +
                "\t\t\t\twill obey Excel's formatting conventions.");
            converted = false;
        }
    }
    catch(Exception ex) {
        System.out.println("Caught an: " + ex.getClass().getName());
        System.out.println("Message: " + ex.getMessage());
        System.out.println("Stacktrace follows:.....");
        ex.printStackTrace(System.out);
        converted = false;
    }
        
    if (converted) {
        System.out.println("Conversion took " +
              (int)((System.currentTimeMillis() - startTime)/1000) + " seconds");
    }
        }
    */

    public static void main(String[] args) {
        String excelfile = args[0];
        String outputfile = null;
        int n = excelfile.lastIndexOf(".");
        outputfile = excelfile.substring(0, n) + ".txt";
        System.out.println(excelfile);
        System.out.println(outputfile);
        String heading = "Code\tPreferred Term\tSynonyms\tDefinition\tNeoplastic Status";
        new ExcelToCSV().toDelimitedFile(excelfile, outputfile, heading);

    }

}