com.globalsight.terminology.importer.ExcelReaderThread.java Source code

Introduction

Here is the source code for com.globalsight.terminology.importer.ExcelReaderThread.java
Source

/**
 *  Copyright 2009 Welocalize, Inc. 
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  
 *  You may obtain a copy of the License at 
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 */

package com.globalsight.terminology.importer;

import org.apache.log4j.Logger;

import com.globalsight.terminology.importer.ImportUtil;
import com.globalsight.terminology.importer.ImportOptions.ColumnDescriptor;
import com.globalsight.terminology.Termbase;
import com.globalsight.terminology.TermbaseException;
import com.globalsight.terminology.Entry;
import com.globalsight.importer.ImportOptions;
import com.globalsight.util.ReaderResult;
import com.globalsight.util.ReaderResultQueue;
import org.dom4j.*;
import java.util.*;
import java.io.*;

import jxl.*;

/**
 * Reads an Excel file and produces Entry objects by putting ReaderResult
 * objects into a ReaderResultQueue.
 *
 * <p>For every sheet that has a non-negative header row configured in the
 * import options, the rows below the header are converted one by one
 * into a <code>conceptGrp</code> document, wrapped in an Entry and
 * handed to the queue. Rows that ImportUtil reports as empty or illegal
 * are skipped; once more than {@link #MAX_EMPTY_ROW} such rows occur in
 * a row, the remainder of the sheet is ignored.</p>
 */
public class ExcelReaderThread extends Thread {
    private static final Logger CATEGORY = Logger.getLogger(ExcelReaderThread.class);

    private ReaderResultQueue m_results;
    private ReaderResult m_result = null;
    private ImportOptions m_options;
    private Termbase m_termbase;

    private DocumentFactory m_factory = DocumentFactory.getInstance();

    /**
     * Number of consecutive empty/illegal rows that is tolerated before
     * the rest of a sheet is skipped.
     */
    private static final int MAX_EMPTY_ROW = 5;

    //
    // Constructor
    //

    /**
     * @param p_q queue that receives one ReaderResult per imported row.
     * @param p_options import options; cast to the terminology-specific
     * ImportOptions when the thread runs.
     * @param p_termbase termbase used to map language names to locales.
     */
    public ExcelReaderThread(ReaderResultQueue p_q, ImportOptions p_options, Termbase p_termbase) {
        m_results = p_q;
        m_options = p_options;
        m_termbase = p_termbase;
    }

    //
    // Thread methods
    //

    /**
     * Reads all configured sheets of the workbook and feeds the queue.
     * Errors are logged, never propagated; the queue is always told that
     * the producer is done and the workbook and its input stream are
     * always closed.
     */
    public void run() {
        InputStream is = null;
        Workbook readBook = null;

        try {
            com.globalsight.terminology.importer.ImportOptions options =
                    (com.globalsight.terminology.importer.ImportOptions) m_options;
            if (options.getColumnHeaderRow() == null) {
                return;
            }

            is = new FileInputStream(options.getFileName());
            WorkbookSettings setting = new WorkbookSettings();
            setting.setEncoding("ISO-8859-1");
            readBook = Workbook.getWorkbook(is, setting);

            Sheet[] sheets = readBook.getSheets();
            for (int i = 0; i < sheets.length; i++) {
                int headerRowNumber = options.getColumnHeaderRow(i);
                if (headerRowNumber < 0) {
                    // No header row configured for this sheet: nothing to import.
                    continue;
                }

                List columnDescriptors = options.getColumnDescriptors(i);

                if (readSheet(sheets[i], i, headerRowNumber, columnDescriptors)) {
                    // The queue asked us to stop reading the file.
                    CATEGORY.info("ReaderThread: interrupted");
                    return;
                }
            }
        } catch (ThreadDeath ignore) {
            CATEGORY.info("ReaderThread: interrupted");
        } catch (Throwable ex) {
            // Should never happen, and there is no better way to handle
            // this case other than passing the exception in m_results,
            // which is not done for now.
            CATEGORY.error("unexpected error", ex);
        } finally {
            // A result that was hired but never put must be given back.
            if (m_result != null) {
                m_results.fireResult(m_result);
                m_result = null;
            }

            // Signal the consumer first so a failing close() cannot
            // prevent the queue from being notified.
            m_results.producerDone();
            m_results = null;

            closeWorkbook(readBook, is);

            CATEGORY.debug("ReaderThread: done");
        }
    }

    /**
     * Converts the data rows of one sheet into entries and puts them
     * into the result queue.
     *
     * @param p_sheet the sheet to read.
     * @param p_sheetIndex zero-based sheet index, used for log messages.
     * @param p_headerRow zero-based index of the header row; reading
     * starts on the following row.
     * @param p_colDescriptors the ColumnDescriptor objects of this sheet.
     * @return <code>true</code> if the queue reported that reading
     * should stop, <code>false</code> if the sheet was read to its end
     * (or abandoned because of too many consecutive empty rows).
     */
    private boolean readSheet(Sheet p_sheet, int p_sheetIndex, int p_headerRow, List p_colDescriptors)
            throws Exception {
        int rowsInSheet = p_sheet.getRows();
        int emptyRow = 0;

        for (int j = p_headerRow + 1; j < rowsInSheet; j++) {
            Cell[] cells = p_sheet.getRow(j);

            if (ImportUtil.isEmptyOrIllegalRow(cells, p_colDescriptors)) {
                CATEGORY.info("sheet " + (p_sheetIndex + 1) + " row " + (j + 1)
                        + ": should at least contain two term columns; ignoring row");

                emptyRow++;

                if (emptyRow > MAX_EMPTY_ROW) {
                    break;
                }

                continue;
            }

            emptyRow = 0;

            // Hire a result only for rows that are actually imported, so
            // skipped rows do not leave hired results behind.
            m_result = m_results.hireResult();

            Document dom = buildEntry(cells, p_colDescriptors);
            Entry entry = new Entry(dom);

            m_result.setResultObject(entry);

            if (CATEGORY.isDebugEnabled()) {
                CATEGORY.debug(entry.getXml());
            }

            boolean done = m_results.put(m_result);
            m_result = null;

            // Stop reading file.
            if (done) {
                return true;
            }
        }

        return false;
    }

    /**
     * Closes the workbook and the stream it was read from. A failure to
     * close the stream is logged and otherwise ignored.
     */
    private void closeWorkbook(Workbook p_book, InputStream p_stream) {
        try {
            if (p_book != null) {
                p_book.close();
            }
        } finally {
            if (p_stream != null) {
                try {
                    p_stream.close();
                } catch (IOException ex) {
                    CATEGORY.warn("cannot close import file", ex);
                }
            }
        }
    }

    /**
     * <p>Converts a list of column values into an Entry.</p>
     *
     * <p>Algorithmic note: columns are managed in a simple FIFO work
     * list. We retrieve the next column and try to map it to the entry.
     * If we can't because the column depends on a column that has not
     * produced an element yet (associatedColumn field), we add it to the
     * end of the list. Rinse and repeat until the list is exhausted.
     * The number of steps is bounded so that column associations that
     * can never be resolved (self-references, cycles) are dropped with a
     * warning instead of looping forever.</p>
     *
     * @param p_cells the cells of one row; may be shorter than the
     * number of described columns.
     * @param p_colDescriptors ColumnDescriptor objects for the row.
     * @return a document whose root element is <code>conceptGrp</code>.
     * @throws Exception if a descriptor has an unknown type or the row
     * contains no non-empty term.
     */
    private Document buildEntry(Cell[] p_cells, List p_colDescriptors) throws TermbaseException, Exception {
        Document result = m_factory.createDocument();
        Element root = result.addElement("conceptGrp");
        ArrayList columnDes = new ArrayList(p_colDescriptors);

        // Array that holds the elements as we construct them, indexed by
        // column position.
        Element[] nodes = new Element[p_cells.length];
        Element node;

        // An entry must have at least one term to be stored; check this.
        int termCount = 0;

        // Each pass over the pending columns resolves at least one column
        // if any progress is possible, so n * (n + 1) steps are enough for
        // every resolvable configuration of n columns.
        int columnCount = columnDes.size();
        long maxSteps = (long) columnCount * (columnCount + 1);

        for (int i = 0; i < columnDes.size(); i++) {
            if (i >= maxSteps) {
                CATEGORY.warn("ignoring " + (columnDes.size() - i)
                        + " queued column(s) whose associated column cannot be resolved");
                break;
            }

            ColumnDescriptor col = (ColumnDescriptor) columnDes.get(i);

            int index = col.m_position;
            if (index >= p_cells.length) {
                // The row ends before this column.
                continue;
            }

            String value = p_cells[index].getContents().trim();

            if (col.m_type.equals("term")) {
                // build the node and add it to the array so later
                // nodes associated with it can find it (prevent
                // endless loop)
                node = buildTermGrp(value);
                nodes[index] = node;

                // only add termGrp to entry if term is non-empty
                if (value.length() > 0) {
                    ++termCount;
                    addTermGrp(root, node, col.m_termLanguage);
                }
            } else if (col.m_type.equals("skip")) {
                continue;
            } else if (col.m_type.equals("source")) {
                Element otherNode = findAssociatedNode(col, nodes, columnDes);
                if (otherNode == null) {
                    continue;
                }

                node = buildSourceGrp(value);
                nodes[index] = node;

                addSourceGrp(otherNode, node);
            }
            // All concept-related attributes
            else if (col.m_type.startsWith("concept")) {
                String type = col.m_type.substring("concept".length());

                node = buildDescripGrp(value, type);
                nodes[index] = node;

                addConceptDescripGrp(root, node);
            }
            // All term-related attributes
            else if (col.m_type.startsWith("term")) {
                Element termNode = findAssociatedNode(col, nodes, columnDes);
                if (termNode == null) {
                    continue;
                }

                String type = col.m_type.substring("term".length());

                node = buildDescripGrp(value, type);
                nodes[index] = node;

                addTermDescripGrp(termNode, node);
            } else {
                throw new Exception("invalid column descriptor " + col.m_type);
            }
        }

        if (termCount == 0) {
            throw new Exception("no terms found, ignoring entry");
        }

        return result;
    }

    /**
     * Finds the element already built for the column that
     * <code>p_col</code> is associated with.
     *
     * <p>If that column has not produced an element yet and is not a
     * skipped column, <code>p_col</code> is appended to
     * <code>p_queue</code> so it is retried later.</p>
     *
     * @return the associated element, or <code>null</code> if
     * <code>p_col</code> cannot be attached right now (no association,
     * association outside the current row, or not yet processed).
     */
    private Element findAssociatedNode(ColumnDescriptor p_col, Element[] p_nodes, List p_queue) {
        int ac = Integer.parseInt(p_col.m_associatedColumn);

        // A negative value means "no association"; a value past the end
        // of the row means the associated cell does not exist in this
        // row, so there is nothing to attach to.
        if (ac < 0 || ac >= p_nodes.length) {
            return null;
        }

        Element associated = p_nodes[ac];
        if (associated == null && !isSkippedColumn(ac)) {
            // The column to which this column belongs has not been
            // processed yet, but may be processed eventually. Put this
            // column at the end of the list to be processed later.
            p_queue.add(p_col);
        }

        return associated;
    }

    /**
     * Tells whether the column at the given index is of type "skip".
     */
    private boolean isSkippedColumn(int index) {
        // NOTE(review): this looks the column up in getColumns() by list
        // index, whereas run() works with getColumnDescriptors(sheet) and
        // m_position - confirm both always agree.
        ColumnDescriptor col = (ColumnDescriptor) ((com.globalsight.terminology.importer.ImportOptions) m_options)
                .getColumns().get(index);

        return col.m_type.equals("skip");
    }

    /**
     * Adds a termGrp to the languageGrp of the given language, creating
     * the languageGrp below <code>p_root</code> if it does not exist yet.
     */
    private void addTermGrp(Element p_root, Element p_term, String p_language) throws TermbaseException {
        Element langGrp = findLanguageGrp(p_root, p_language);

        if (langGrp == null) {
            langGrp = buildLanguageGrp(p_language);
            p_root.add(langGrp);
        }

        langGrp.add(p_term);
    }

    /**
     * Searches the languageGrp children of <code>p_root</code> for the
     * one whose language element carries the given name. The comparison
     * is done in Java rather than through a concatenated XPath string so
     * that language names containing quotes cannot break the lookup.
     *
     * @return the matching languageGrp or <code>null</code>.
     */
    private Element findLanguageGrp(Element p_root, String p_language) {
        if (p_language == null) {
            return null;
        }

        for (Iterator it = p_root.elementIterator("languageGrp"); it.hasNext();) {
            Element grp = (Element) it.next();
            Element language = grp.element("language");

            if (language != null && p_language.equals(language.attributeValue("name"))) {
                return grp;
            }
        }

        return null;
    }

    /** Adds a concept-level descripGrp directly below the entry root. */
    private void addConceptDescripGrp(Element p_root, Element p_descrip) {
        p_root.add(p_descrip);
    }

    /** Adds a descripGrp to the element of the column it belongs to. */
    private void addTermDescripGrp(Element p_node, Element p_descrip) {
        p_node.add(p_descrip);
    }

    /** Adds a sourceGrp to the element of the column it belongs to. */
    private void addSourceGrp(Element p_node, Element p_source) {
        p_node.add(p_source);
    }

    /** Builds <code>&lt;termGrp&gt;&lt;term&gt;value&lt;/term&gt;&lt;/termGrp&gt;</code>. */
    private Element buildTermGrp(String p_value) {
        Element grp = m_factory.createElement("termGrp");
        grp.addElement("term").addText(p_value);
        return grp;
    }

    /**
     * Builds a descripGrp holding one descrip element with the given
     * text and <code>type</code> attribute.
     */
    private Element buildDescripGrp(String p_value, String p_type) {
        Element grp = m_factory.createElement("descripGrp");
        Element node = grp.addElement("descrip").addText(p_value);
        node.addAttribute("type", p_type);
        return grp;
    }

    /** Builds <code>&lt;sourceGrp&gt;&lt;source&gt;value&lt;/source&gt;&lt;/sourceGrp&gt;</code>. */
    private Element buildSourceGrp(String p_value) {
        Element grp = m_factory.createElement("sourceGrp");
        grp.addElement("source").addText(p_value);
        return grp;
    }

    /**
     * Builds a languageGrp whose language element carries the language
     * name and the locale the termbase reports for that language.
     */
    private Element buildLanguageGrp(String p_language) throws TermbaseException {
        String locale = m_termbase.getLocaleByLanguage(p_language);

        Element grp = m_factory.createElement("languageGrp");
        Element node = grp.addElement("language");
        node.addAttribute("name", p_language);
        node.addAttribute("locale", locale);
        return grp;
    }

}