org.wandora.application.tools.extractors.excel.AbstractExcelExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.wandora.application.tools.extractors.excel.AbstractExcelExtractor.java

Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 * 
 * Copyright (C) 2004-2016 Wandora Team
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.wandora.application.tools.extractors.excel;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import javax.swing.Icon;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Comment;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.FormulaEvaluator;
import org.apache.poi.ss.usermodel.RichTextString;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.wandora.application.WandoraTool;
import static org.wandora.application.WandoraToolLogger.WAIT;
import org.wandora.application.WandoraToolType;
import org.wandora.application.gui.UIBox;
import org.wandora.application.tools.AbstractWandoraTool;
import org.wandora.application.tools.extractors.AbstractExtractor;
import static org.wandora.application.tools.extractors.AbstractExtractor.FILE_EXTRACTOR;
import static org.wandora.application.tools.extractors.AbstractExtractor.RAW_EXTRACTOR;
import static org.wandora.application.tools.extractors.AbstractExtractor.URL_EXTRACTOR;
import static org.wandora.application.tools.importers.graphs.AdjacencyListImport.SI_PREFIX;
import org.wandora.topicmap.Association;
import org.wandora.topicmap.Locator;
import org.wandora.topicmap.TMBox;
import org.wandora.topicmap.Topic;
import org.wandora.topicmap.TopicMap;
import org.wandora.topicmap.TopicMapException;
import org.wandora.topicmap.XTMPSI;

/**
 *
 * @author akivela
 */

public abstract class AbstractExcelExtractor extends AbstractExtractor implements WandoraTool {

    public static String DEFAULT_LANG = "en";

    @Override
    public String getName() {
        return "Abstract Excel extractor";
    }

    @Override
    public String getDescription() {
        return "Abstract Excel extractor.";
    }

    @Override
    public WandoraToolType getType() {
        return WandoraToolType.createExtractType();
    }

    @Override
    public Icon getIcon() {
        return UIBox.getIcon("gui/icons/extract_excel.png");
    }

    @Override
    public boolean runInOwnThread() {
        return true;
    }

    @Override
    public boolean useTempTopicMap() {
        return false;
    }

    @Override
    public boolean useURLCrawler() {
        return false;
    }

    @Override
    public int getExtractorType() {
        return FILE_EXTRACTOR | URL_EXTRACTOR;
    }

    // -------------------------------------------------------------------------

    @Override
    public boolean _extractTopicsFrom(File f, TopicMap topicMap) throws Exception {
        try {
            if (f != null) {
                String fn = f.getAbsolutePath();
                if (fn.toLowerCase().endsWith(".xls")) {
                    HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(f));
                    processWorkbook(workbook, topicMap);
                } else {
                    XSSFWorkbook workbook = new XSSFWorkbook(f.getAbsolutePath());
                    processWorkbook(workbook, topicMap);
                }
                log("Ok!");
            }
        } catch (FileNotFoundException ex) {
            log(ex);
        } catch (IOException ex) {
            log(ex);
        } catch (Exception ex) {
            log(ex);
        }
        setState(WAIT);
        return true;
    }

    @Override
    public boolean _extractTopicsFrom(URL u, TopicMap topicMap) throws Exception {
        try {
            HSSFWorkbook workbook = new HSSFWorkbook(u.openStream());
            processWorkbook(workbook, topicMap);
            log("Ok!");
        } catch (FileNotFoundException ex) {
            log(ex);
        } catch (IOException ex) {
            log(ex);
        } catch (Exception ex) {
            log(ex);
        }
        setState(WAIT);
        return true;
    }

    @Override
    public boolean _extractTopicsFrom(String str, TopicMap t) throws Exception {
        return false;
    }

    public abstract void processWorkbook(HSSFWorkbook workbook, TopicMap topicMap);

    public abstract void processWorkbook(XSSFWorkbook workbook, TopicMap topicMap);

    public abstract void processSheet(HSSFSheet sheet, TopicMap topicMap);

    public abstract void processSheet(XSSFSheet sheet, TopicMap topicMap);

    // -------------------------------------------------------------------------

    protected String getCellValueAsString(Cell cell) {
        if (cell != null) {
            if (cell.getCellType() == Cell.CELL_TYPE_FORMULA) {
                return getCellValueAsString(cell, cell.getCachedFormulaResultType());
            } else {
                return getCellValueAsString(cell, cell.getCellType());
            }
        }
        return null;
    }

    private DataFormatter formatter = new DataFormatter();
    private DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    protected String getCellValueAsString(Cell cell, int type) {
        if (cell != null) {
            switch (type) {
            case Cell.CELL_TYPE_ERROR: {
                return "ERROR" + cell.getErrorCellValue();
            }
            case Cell.CELL_TYPE_BOOLEAN: {
                return "" + cell.getBooleanCellValue();
            }
            case Cell.CELL_TYPE_NUMERIC: {
                if (DateUtil.isCellDateFormatted(cell)) {
                    return dateFormat.format(cell.getDateCellValue());
                } else {
                    double value = cell.getNumericCellValue();
                    String formatString = cell.getCellStyle().getDataFormatString();
                    int formatIndex = cell.getCellStyle().getDataFormat();
                    return formatter.formatRawCellContents(value, formatIndex, formatString);
                }
            }
            case Cell.CELL_TYPE_STRING: {
                return cell.getRichStringCellValue().getString();
            }
            }
        }
        return null;
    }

    // -------------------------------------------------------------------------

    public void associateToSheet(Cell cell, TopicMap tm) throws TopicMapException {
        if (cell.getSheet() != null) {
            Topic sheetTypeTopic = getSheetTypeTopic(tm);
            Topic sheetTopic = getSheetTopic(cell, tm);
            Topic cellTypeTopic = getCellTypeTopic(tm);
            Topic cellTopic = getCellTopic(cell, tm);

            if (sheetTypeTopic != null && sheetTopic != null && cellTypeTopic != null && cellTopic != null) {
                Association a = tm.createAssociation(sheetTypeTopic);
                a.addPlayer(cellTopic, cellTypeTopic);
                a.addPlayer(sheetTopic, sheetTypeTopic);
            }
        }
    }

    public void associateToLocation(Cell cell, TopicMap tm) throws TopicMapException {
        Topic locationTypeTopic = getCellLocationTypeTopic(tm);
        Topic rowTypeTopic = getRowTypeTopic(tm);
        Topic rowTopic = getRowTopic(cell, tm);
        Topic columnTypeTopic = getColumnTypeTopic(tm);
        Topic columnTopic = getColumnTopic(cell, tm);
        Topic cellTypeTopic = getCellTypeTopic(tm);
        Topic cellTopic = getCellTopic(cell, tm);

        if (locationTypeTopic != null && rowTypeTopic != null && rowTopic != null && columnTopic != null
                && columnTypeTopic != null && cellTypeTopic != null && cellTopic != null) {
            Association a = tm.createAssociation(locationTypeTopic);
            a.addPlayer(cellTopic, cellTypeTopic);
            a.addPlayer(rowTopic, rowTypeTopic);
            a.addPlayer(columnTopic, columnTypeTopic);
        }
    }

    public void associateToColors(Cell cell, TopicMap tm) throws TopicMapException {
        if (cell.getCellStyle() != null) {
            Topic colorTypeTopic = getBackgroundColorTypeTopic(tm);
            Topic colorTopic = getColorTopic(cell, tm);
            Topic cellTypeTopic = getCellTypeTopic(tm);
            Topic cellTopic = getCellTopic(cell, tm);

            if (colorTypeTopic != null && colorTopic != null && cellTypeTopic != null && cellTopic != null) {
                Association a = tm.createAssociation(colorTypeTopic);
                a.addPlayer(cellTopic, cellTypeTopic);
                a.addPlayer(colorTopic, colorTypeTopic);
            }

            colorTypeTopic = getForegroundColorTypeTopic(tm);
            colorTopic = getColorTopic(cell, tm);

            if (colorTypeTopic != null && colorTopic != null && cellTypeTopic != null && cellTopic != null) {
                Association a = tm.createAssociation(colorTypeTopic);
                a.addPlayer(cellTopic, cellTypeTopic);
                a.addPlayer(colorTopic, colorTypeTopic);
            }
        }
    }

    public void associateToType(Cell cell, TopicMap tm) throws TopicMapException {
        Topic typeTypeTopic = getCellTypeTypeTopic(tm);
        Topic typeTopic = getCellTypeTopic(cell, tm);
        Topic cellTypeTopic = getCellTypeTopic(tm);
        Topic cellTopic = getCellTopic(cell, tm);

        if (typeTypeTopic != null && typeTopic != null && cellTypeTopic != null && cellTopic != null) {
            Association a = tm.createAssociation(typeTypeTopic);
            a.addPlayer(cellTopic, cellTypeTopic);
            a.addPlayer(typeTopic, typeTypeTopic);
        }
    }

    public void associateToComment(Cell cell, TopicMap tm) throws TopicMapException {
        if (cell.getCellComment() != null) {
            Topic commentTypeTopic = getCommentTypeTopic(tm);
            Topic commentTopic = getCommentTopic(cell, tm);
            Topic cellTypeTopic = getCellTypeTopic(tm);
            Topic cellTopic = getCellTopic(cell, tm);

            if (commentTypeTopic != null && commentTopic != null && cellTypeTopic != null && cellTopic != null) {
                Association a = tm.createAssociation(commentTypeTopic);
                a.addPlayer(cellTopic, cellTypeTopic);
                a.addPlayer(commentTopic, commentTypeTopic);
            }
        }
    }

    public void associateToFormula(Cell cell, TopicMap tm) throws TopicMapException {
        if (cell.getCellType() == Cell.CELL_TYPE_FORMULA) {
            if (cell.getCellFormula() != null) {
                Topic formulaTypeTopic = getFormulaTypeTopic(tm);
                Topic formulaTopic = getFormulaTopic(cell, tm);
                Topic cellTypeTopic = getCellTypeTopic(tm);
                Topic cellTopic = getCellTopic(cell, tm);

                if (formulaTypeTopic != null && formulaTopic != null && cellTypeTopic != null
                        && cellTopic != null) {
                    Association a = tm.createAssociation(formulaTypeTopic);
                    a.addPlayer(cellTopic, cellTypeTopic);
                    a.addPlayer(formulaTopic, formulaTypeTopic);
                }
            }
        }
    }

    // -------------------------------------------------------------------------

    public static final int CELL_VALUE = 1;
    public static final int CELL_LOCATION = 2;
    public static final int CELL_SHEET_AND_LOCATION = 4;
    public static final int CELL_HASH = 8;

    public static int CELL_TOPIC_IS_BASED_ON = CELL_VALUE;

    public static String EXCEL_SI_PREFIX = "http://wandora.org/si/excel";
    public static String EXCEL_COLUMN_SI_PREFIX = EXCEL_SI_PREFIX + "/column";
    public static String EXCEL_ROW_SI_PREFIX = EXCEL_SI_PREFIX + "/row";
    public static String EXCEL_SHEET_SI_PREFIX = EXCEL_SI_PREFIX + "/sheet";
    public static String EXCEL_CELL_SI_PREFIX = EXCEL_SI_PREFIX + "/cell";
    public static String EXCEL_COLOR_SI_PREFIX = EXCEL_SI_PREFIX + "/color";
    public static String EXCEL_FORMULA_SI_PREFIX = EXCEL_SI_PREFIX + "/formula";
    public static String EXCEL_COMMENT_SI_PREFIX = EXCEL_SI_PREFIX + "/comment";
    public static String EXCEL_CELL_TYPE_SI_PREFIX = EXCEL_SI_PREFIX + "/cell-type";
    public static String EXCEL_CELL_LOCATION_SI_PREFIX = EXCEL_SI_PREFIX + "/cell-location";

    public static String DEFAULT_ASSOCIATION_TYPE_SI = EXCEL_SI_PREFIX + "/association-type";

    public static String DEFAULT_ROLE_TYPE_SI = EXCEL_SI_PREFIX + "/role";
    public static String DEFAULT_UPPER_ROLE_SI = DEFAULT_ROLE_TYPE_SI + "/upper";
    public static String DEFAULT_LOWER_ROLE_SI = DEFAULT_ROLE_TYPE_SI + "/lower";

    public static String DEFAULT_OCCURRENCE_TYPE_SI = EXCEL_SI_PREFIX + "/occurrence-type";

    public static String EXCEL_CELL_VALUE_SI = EXCEL_SI_PREFIX + "/cell-value";

    // -----

    public Topic getExcelTypeTopic(TopicMap tm) {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_SI_PREFIX, "Excel");
        try {
            typeTopic.addType(tm.getTopic(TMBox.WANDORACLASS_SI));
        } catch (Exception e) {
        }
        return typeTopic;
    }

    // -----

    public Topic getCellTopic(Cell cell, TopicMap tm) throws TopicMapException {
        String cellIdentifier = null;
        switch (CELL_TOPIC_IS_BASED_ON) {
        case CELL_VALUE: {
            cellIdentifier = getCellValueAsString(cell);
            break;
        }
        case CELL_SHEET_AND_LOCATION: {
            Sheet sheet = cell.getSheet();
            String sheetName = sheet.getSheetName();
            cellIdentifier = sheetName + "-" + cell.getColumnIndex() + "-" + cell.getRowIndex();
            break;
        }
        case CELL_LOCATION: {
            cellIdentifier = cell.getColumnIndex() + "-" + cell.getRowIndex();
            break;
        }
        case CELL_HASH: {
            cellIdentifier = Integer.toString(cell.hashCode());
            break;
        }
        }
        if (cellIdentifier != null) {
            String si = EXCEL_CELL_SI_PREFIX + "/" + urlEncode(cellIdentifier);
            Topic cellTopic = getOrCreateTopic(tm, si, cellIdentifier);
            cellTopic.addType(getCellTypeTopic(tm));
            return cellTopic;
        }
        return null;
    }

    public Topic getCellTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_CELL_SI_PREFIX, "Excel cell");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    public Topic getCellValueTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_CELL_VALUE_SI, "Excel cell value");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -----

    public Topic getCellTypeTopic(Cell cell, TopicMap tm) throws TopicMapException {
        int type = cell.getCellType();
        String typeStr = "string";
        switch (type) {
        case Cell.CELL_TYPE_BLANK: {
            typeStr = "blank";
            break;
        }
        case Cell.CELL_TYPE_BOOLEAN: {
            typeStr = "boolean";
            break;
        }
        case Cell.CELL_TYPE_ERROR: {
            typeStr = "error";
            break;
        }
        case Cell.CELL_TYPE_FORMULA: {
            typeStr = "formula";
            break;
        }
        case Cell.CELL_TYPE_NUMERIC: {
            typeStr = "numeric";
            break;
        }
        case Cell.CELL_TYPE_STRING: {
            typeStr = "string";
            break;
        }
        }
        Topic t = getOrCreateTopic(tm, EXCEL_CELL_TYPE_SI_PREFIX + "/" + typeStr, "Excel cell type " + typeStr);
        t.addType(getCellTypeTypeTopic(tm));
        return t;
    }

    public Topic getCellTypeTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_CELL_TYPE_SI_PREFIX, "Excel cell type");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -----

    public Topic getColorTopic(Cell cell, TopicMap tm) throws TopicMapException {
        CellStyle style = cell.getCellStyle();
        int color = style.getFillBackgroundColor();
        String si = EXCEL_COLOR_SI_PREFIX + "/" + urlEncode(Integer.toString(color));
        Topic topic = getOrCreateTopic(tm, si, "Color " + color);
        topic.addType(getColorTypeTopic(tm));
        return topic;
    }

    public Topic getColorTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_COLOR_SI_PREFIX, "Excel color");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    public Topic getBackgroundColorTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_COLOR_SI_PREFIX + "/background", "Excel background color");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    public Topic getForegroundColorTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_COLOR_SI_PREFIX + "/foreground", "Excel foreground color");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -----

    public Topic getDefaultAssociationTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, DEFAULT_ASSOCIATION_TYPE_SI, "Excel association type");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    public Topic getDefaultUpperRoleTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, DEFAULT_UPPER_ROLE_SI, "Excel upper role");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    public Topic getDefaultLowerRoleTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, DEFAULT_LOWER_ROLE_SI, "Excel lower role");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -----

    public Topic getRowTopic(Cell cell, TopicMap tm) throws TopicMapException {
        Topic topic = getOrCreateTopic(tm,
                EXCEL_ROW_SI_PREFIX + "/" + urlEncode(Integer.toString(cell.getRowIndex())),
                "Excel row " + cell.getRowIndex());
        topic.addType(getRowTypeTopic(tm));
        return topic;
    }

    public Topic getRowTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_ROW_SI_PREFIX, "Excel row");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -----

    public Topic getColumnTopic(Cell cell, TopicMap tm) throws TopicMapException {
        Topic topic = getOrCreateTopic(tm,
                EXCEL_COLUMN_SI_PREFIX + "/" + urlEncode(Integer.toString(cell.getColumnIndex())),
                "Excel column " + cell.getColumnIndex());
        topic.addType(getColumnTypeTopic(tm));
        return topic;
    }

    public Topic getColumnTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_COLUMN_SI_PREFIX, "Excel column");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -----

    public Topic getCommentTopic(Cell cell, TopicMap tm) throws TopicMapException {
        Comment comment = cell.getCellComment();
        if (comment != null) {
            RichTextString rts = comment.getString();
            String str = rts.getString();
            String basename = str.replace('\n', ' ');
            basename = basename.replace('\r', ' ');
            basename = basename.replace('\t', ' ');
            Topic topic = getOrCreateTopic(tm, EXCEL_COMMENT_SI_PREFIX + "/" + urlEncode(basename), basename);
            topic.setData(getCommentTypeTopic(tm), tm.getTopic(XTMPSI.getLang(DEFAULT_LANG)), str);
            topic.addType(getCommentTypeTopic(tm));
            return topic;
        }
        return null;
    }

    public Topic getCommentTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_COMMENT_SI_PREFIX, "Excel comment");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // ------

    public Topic getFormulaTopic(Cell cell, TopicMap tm) throws TopicMapException {
        String formula = cell.getCellFormula();
        if (formula != null) {
            Topic topic = getOrCreateTopic(tm, EXCEL_FORMULA_SI_PREFIX + "/" + urlEncode(formula), formula);
            topic.setData(getFormulaTypeTopic(tm), tm.getTopic(XTMPSI.getLang(DEFAULT_LANG)), formula);
            topic.addType(getFormulaTypeTopic(tm));
            return topic;
        }
        return null;
    }

    public Topic getFormulaTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_FORMULA_SI_PREFIX, "Excel formula");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // ------

    public Topic getCellLocationTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_CELL_LOCATION_SI_PREFIX, "Excel cell location");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // ------

    public Topic getSheetTopic(Cell cell, TopicMap tm) throws TopicMapException {
        Sheet sheet = cell.getSheet();
        if (sheet != null) {
            String sheetName = sheet.getSheetName();
            Topic topic = getOrCreateTopic(tm, EXCEL_SHEET_SI_PREFIX + "/" + urlEncode(sheetName), sheetName);
            topic.addType(getSheetTypeTopic(tm));
            return topic;
        }
        return null;
    }

    public Topic getSheetTypeTopic(TopicMap tm) throws TopicMapException {
        Topic typeTopic = getOrCreateTopic(tm, EXCEL_SHEET_SI_PREFIX, "Excel sheet");
        typeTopic.addType(getExcelTypeTopic(tm));
        return typeTopic;
    }

    // -------------------------------------------------------------------------

    public Topic getOrCreateTopic(TopicMap map, String si, String basename) {
        Topic topic = null;
        try {
            topic = map.getTopic(si);
            if (topic == null) {
                topic = map.createTopic();
                topic.addSubjectIdentifier(new Locator(si));
                if (basename != null && basename.length() > 0)
                    topic.setBaseName(basename);
            }
        } catch (Exception e) {
            log(e);
            e.printStackTrace();
        }
        return topic;
    }

}