Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package com.ostrichemulators.semtool.poi.main; import com.ostrichemulators.semtool.poi.main.xlsxml.LoadingSheetXmlHandler; import com.ostrichemulators.semtool.poi.main.xlsxml.SheetTypeXmlHandler; import com.ostrichemulators.semtool.poi.main.xlsxml.LoaderTabXmlHandler; import com.ostrichemulators.semtool.poi.main.xlsxml.MetadataTabXmlHandler; import com.ostrichemulators.semtool.poi.main.ImportValidationException.ErrorType; import com.ostrichemulators.semtool.poi.main.xlsxml.XlsXmlBase; import com.ostrichemulators.semtool.util.MultiMap; import com.ostrichemulators.semtool.util.Utility; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.log4j.Logger; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.StylesTable; import org.dom4j.Document; import org.dom4j.Element; import org.dom4j.Namespace; import org.dom4j.QName; import org.dom4j.io.SAXReader; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; /** * A class to read an xlsx file and produce an ImportData instance. * * @author ryan */ public class LowMemXlsReader { private static final Logger log = Logger.getLogger(LowMemXlsReader.class); private final LinkedHashMap<String, String> sheetNameIdLkp; private final List<String> sharedStrings; private final XSSFReader reader; private final OPCPackage pkg; private final StylesTable styles; private boolean lsInMem = false; public LowMemXlsReader(File filename) throws IOException { this(new BufferedInputStream(new FileInputStream(filename))); } public LowMemXlsReader(String filename) throws IOException { this(new File(filename)); } public LowMemXlsReader(InputStream stream) throws IOException { log.debug("reading with lo-mem xls reader"); sharedStrings = new ArrayList<>(); try { pkg = OPCPackage.open(stream); reader = new XSSFReader(pkg); styles = reader.getStylesTable(); sheetNameIdLkp = readSheetInfo(reader); populateSharedStrings(reader); } catch (OpenXML4JException e) { throw new IOException("unexpected error" + e.getLocalizedMessage(), e); } } public void keepSheetDataInMemory(boolean b) { lsInMem = b; } /** * Releases resources used by this reader */ public void release() { try { pkg.close(); sheetNameIdLkp.clear(); sharedStrings.clear(); } catch (Exception e) { log.error(e, e); } } /** * Gets the sheet types. If a loader tab exists, only those tabs will be * checked (and the metadata tab will be verified against what the loader tab * says). * * * @return * @throws ImportValidationException */ public Map<String, SheetType> getSheetTypes() throws ImportValidationException { Map<String, SheetType> types = new HashMap<>(); Set<String> tabsToCheck = new HashSet<>(); boolean checktypes = false; try { XMLReader parser = XMLReaderFactory.createXMLReader(); if (sheetNameIdLkp.containsKey("Loader")) { checktypes = true; try (InputStream is = reader.getSheet(sheetNameIdLkp.get("Loader"))) { LoaderTabXmlHandler handler = new LoaderTabXmlHandler(sharedStrings); parser.setContentHandler(handler); InputSource sheetSource = new InputSource(is); parser.parse(sheetSource); types.putAll(handler.getSheetTypes()); tabsToCheck.addAll(types.keySet()); if (tabsToCheck.isEmpty()) { throw new ImportValidationException(ErrorType.MISSING_DATA, "No data to process"); } } } else { tabsToCheck.addAll(sheetNameIdLkp.keySet()); } // now check the actual sheets SheetTypeXmlHandler handler = new SheetTypeXmlHandler(sharedStrings); parser.setContentHandler(handler); boolean seenMetadata = false; // we can only have 1 metadata tab for (String sheetname : tabsToCheck) { if (!sheetNameIdLkp.containsKey(sheetname)) { throw new ImportValidationException(ErrorType.MISSING_DATA, "Missing sheet: " + sheetname); } try (InputStream is = reader.getSheet(sheetNameIdLkp.get(sheetname))) { InputSource sheetSource = new InputSource(is); parser.parse(sheetSource); SheetType sheettype = handler.getSheetType(); boolean sheetsaysM = (SheetType.METADATA == sheettype); if (sheetsaysM) { if (seenMetadata) { throw new ImportValidationException(ErrorType.TOO_MUCH_DATA, "Too many metadata tabs in loading file"); } seenMetadata = true; } SheetType loadertype = types.get(sheetname); if (checktypes) { if ((SheetType.USUAL == loadertype && sheetsaysM) || SheetType.METADATA == loadertype && !sheetsaysM) { // if the loader or the sheet itself says its a metadata sheet, // then both types must agree throw new ImportValidationException(ErrorType.WRONG_TABTYPE, "Loader Sheet data type for " + sheetname + " conflicts with sheet type"); } } types.put(sheetname, sheettype); } } } catch (SAXException | IOException | InvalidFormatException e) { log.error(e, e); } return types; } public ImportMetadata getMetadata() throws ImportValidationException { ImportData id = new ImportData(); id.getMetadata().setNamespaces(Utility.DEFAULTNAMESPACES); try { XMLReader parser = XMLReaderFactory.createXMLReader(); Map<String, SheetType> types = getSheetTypes(); MultiMap<SheetType, String> mm = MultiMap.flip(types); // load the metadata sheet first, if we have one for (String metasheet : mm.getNN(SheetType.METADATA)) { try (InputStream is = reader.getSheet(sheetNameIdLkp.get(metasheet))) { MetadataTabXmlHandler handler = new MetadataTabXmlHandler(sharedStrings, id.getMetadata()); parser.setContentHandler(handler); InputSource sheetSource = new InputSource(is); parser.parse(sheetSource); id.setMetadata(handler.getMetadata()); } } } catch (SAXException | InvalidFormatException | IOException ife) { log.error(ife, ife); } return id.getMetadata(); } public ImportData getData() throws ImportValidationException { ImportData id = new ImportData(); id.getMetadata().setNamespaces(Utility.DEFAULTNAMESPACES); try { XMLReader parser = XMLReaderFactory.createXMLReader(); Map<String, SheetType> types = getSheetTypes(); MultiMap<SheetType, String> mm = MultiMap.flip(types); // load the metadata sheet first, if we have one for (String metasheet : mm.getNN(SheetType.METADATA)) { try (InputStream is = reader.getSheet(sheetNameIdLkp.get(metasheet))) { MetadataTabXmlHandler handler = new MetadataTabXmlHandler(sharedStrings, id.getMetadata()); parser.setContentHandler(handler); InputSource sheetSource = new InputSource(is); parser.parse(sheetSource); id.setMetadata(handler.getMetadata()); } types.remove(metasheet); // don't reprocess in the next loop } for (Map.Entry<String, SheetType> typeen : types.entrySet()) { String sheetname = typeen.getKey(); String sheetid = sheetNameIdLkp.get(sheetname); SheetType sheettype = typeen.getValue(); try (InputStream is = reader.getSheet(sheetid)) { if (SheetType.NODE == sheettype || SheetType.RELATION == sheettype) { LoadingSheetXmlHandler handler = new LoadingSheetXmlHandler(sharedStrings, styles, sheetname, id.getMetadata().getNamespaces(), lsInMem); parser.setContentHandler(handler); InputSource sheetSource = new InputSource(is); parser.parse(sheetSource); LoadingSheetData lsd = handler.getSheet(); if (lsd.isEmpty()) { lsd.release(); throw new ImportValidationException(ErrorType.NOT_A_LOADING_SHEET, "Sheet " + sheetname + " contains no loadable data"); } id.add(lsd); } } } } catch (SAXException | InvalidFormatException | IOException ife) { log.error(ife, ife); } if (id.isEmpty()) { id.release(); throw new ImportValidationException(ErrorType.MISSING_DATA, "No data to process"); } return id; } public Collection<String> getSheetNames() { return sheetNameIdLkp.keySet(); } /** * Gets sheet name-to-id mapping * * @param r * @return */ private LinkedHashMap<String, String> readSheetInfo(XSSFReader r) { LinkedHashMap<String, String> map = new LinkedHashMap<>(); try (InputStream is = r.getWorkbookData()) { SAXReader sax = new SAXReader(); Document doc = sax.read(is); Namespace ns = new Namespace("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"); Element sheets = doc.getRootElement().element("sheets"); for (Object sheet : sheets.elements("sheet")) { Element e = Element.class.cast(sheet); String name = e.attributeValue("name"); String id = e.attributeValue(new QName("id", ns)); map.put(name, id); } } catch (Exception e) { log.error(e, e); } return map; } private void populateSharedStrings(XSSFReader r) { try (InputStream is = r.getSharedStringsData()) { XMLReader parser = XMLReaderFactory.createXMLReader(); ContentHandler handler = new XlsXmlBase(new ArrayList<>()) { int count = 0; @Override public void startDocument() throws SAXException { super.startDocument(); count = 0; } @Override public void endDocument() throws SAXException { super.endDocument(); log.debug(count + " strings cached"); } @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { setReading("t".equals(localName)); resetContents(); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (isReading()) { sharedStrings.add(getContents()); setReading(false); count++; } } }; parser.setContentHandler(handler); parser.parse(new InputSource(is)); } catch (Exception e) { log.error(e, e); } } }