Java tutorial
/******************************************************************************* * Copyright (C) 2013, 2014, 2015 by Inria and Paris-Sud University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package fr.inria.oak.paxquery.pact.io; import java.io.IOException; import java.util.Iterator; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.flink.api.common.io.statistics.BaseStatistics; import org.apache.flink.api.java.record.io.FileInputFormat; import org.apache.flink.api.java.record.operators.FileDataSource; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.fs.FileInputSplit; import org.apache.flink.types.Record; import org.apache.flink.types.StringValue; import fr.inria.oak.paxquery.common.xml.navigation.NavigationTreePattern; import fr.inria.oak.paxquery.common.xml.navigation.NavigationTreePatternUtils; import fr.inria.oak.paxquery.pact.configuration.PACTOperatorsConfiguration; import fr.inria.oak.paxquery.pact.operations.RecordOperations; import fr.inria.oak.paxquery.pact.operations.xml.navigation.SingleDocumentExtractor; /** * Input format that takes a collection of XML documents, and for each document generates * records based on the embeddings of a navigation tree pattern in the document. * */ public class XmlNavTreePatternInputFormat extends FileInputFormat { /** * */ private static final long serialVersionUID = -159202727923158624L; private static final Log logger = LogFactory.getLog(XmlNavTreePatternInputFormat.class); private String documentID; private boolean reachedEnd; private NavigationTreePattern navigationTreePattern; private XMLStreamReader streamReader; private SingleDocumentExtractor extractor; private boolean attachDocumentID; private Iterator<Record> pactRecordsIterator; @Override public void configure(Configuration parameters) { super.configure(parameters); this.init(); // read your own parameters final NavigationTreePattern ntp = NavigationTreePatternUtils.getTreePatternFromString( parameters.getString(PACTOperatorsConfiguration.NTP_STRING.toString(), null), "NavigationTreePattern"); this.navigationTreePattern = ntp; final boolean attachDocumentID = parameters .getBoolean(PACTOperatorsConfiguration.ATTACH_DOCUMENTID_BOOLEAN.toString(), false); this.attachDocumentID = attachDocumentID; } private void init() { this.reachedEnd = false; } @Override public FileBaseStatistics getStatistics(BaseStatistics cachedStatistics) { //TODO: How to gather statistics? return null; } @Override public boolean reachedEnd() throws IOException { if (this.reachedEnd) { this.init(); return true; } return false; } @Override public void open(FileInputSplit split) throws IOException { super.open(split); this.documentID = split.getPath().toString(); XMLInputFactory factory = XMLInputFactory.newInstance(); try { this.streamReader = factory.createXMLStreamReader(this.stream); } catch (XMLStreamException e) { logger.error("XMLStreamException", e); } this.extractor = new SingleDocumentExtractor(this.navigationTreePattern, this.streamReader); } /* @Override public boolean nextRecord(Record record) throws IOException { if(this.pactRecordsIterator!=null) { if(this.pactRecordsIterator.hasNext()) { if(this.attachDocumentID) { record.addField(new StringValue(this.documentID)); RecordOperations.concatenate(record,this.pactRecordsIterator.next()); } else this.pactRecordsIterator.next().copyTo(record); return true; } this.pactRecordsIterator = null; this.extractor.getRecords().clear(); } try { while(this.streamReader.hasNext()) { this.streamReader.next(); if(this.streamReader.getEventType() == XMLStreamConstants.START_ELEMENT) { this.extractor.startElement(); } else if(this.streamReader.getEventType() == XMLStreamConstants.END_ELEMENT) { this.extractor.endElement(); } else if(this.streamReader.getEventType() == XMLStreamConstants.CHARACTERS) { this.extractor.characters(); } else if(this.streamReader.getEventType() == XMLStreamConstants.END_DOCUMENT) { this.reachedEnd = true; } if(this.extractor.getRecords().size() != 0) { this.pactRecordsIterator = this.extractor.getRecords().iterator(); if(this.attachDocumentID) { record.addField(new StringValue(this.documentID)); RecordOperations.concatenate(record,this.pactRecordsIterator.next()); } else this.pactRecordsIterator.next().copyTo(record); return true; } } return false; } catch (XMLStreamException e) { logger.error("XMLStreamException", e); return false; } } */ @Override public Record nextRecord(Record record) throws IOException { if (this.pactRecordsIterator != null) { if (this.pactRecordsIterator.hasNext()) { if (this.attachDocumentID) { record.addField(new StringValue(this.documentID)); RecordOperations.concatenate(record, this.pactRecordsIterator.next()); } else this.pactRecordsIterator.next().copyTo(record); //return true; return record; } this.pactRecordsIterator = null; this.extractor.getRecords().clear(); } try { while (this.streamReader.hasNext()) { this.streamReader.next(); if (this.streamReader.getEventType() == XMLStreamConstants.START_ELEMENT) { this.extractor.startElement(); } else if (this.streamReader.getEventType() == XMLStreamConstants.END_ELEMENT) { this.extractor.endElement(); } else if (this.streamReader.getEventType() == XMLStreamConstants.CHARACTERS) { this.extractor.characters(); } else if (this.streamReader.getEventType() == XMLStreamConstants.END_DOCUMENT) { this.reachedEnd = true; } if (this.extractor.getRecords().size() != 0) { this.pactRecordsIterator = this.extractor.getRecords().iterator(); if (this.attachDocumentID) { record.addField(new StringValue(this.documentID)); RecordOperations.concatenate(record, this.pactRecordsIterator.next()); } else this.pactRecordsIterator.next().copyTo(record); //return true; return record; } } //return false; return null; } catch (XMLStreamException e) { logger.error("XMLStreamException", e); //return false; return null; } } // ============================================================================================ /** * Creates a configuration builder that can be used to set the input format's parameters to the config in a fluent * fashion. * * @return A config builder for setting parameters. */ public static ConfigBuilder configureXmlNavInputFormat(FileDataSource target) { return new ConfigBuilder(target.getParameters()); } /** * Abstract builder used to set parameters to the input format's configuration in a fluent way. */ protected static abstract class AbstractConfigBuilder<T> { /** * The configuration into which the parameters will be written. */ protected final Configuration config; // -------------------------------------------------------------------- /** * Creates a new builder for the given configuration. * * @param targetConfig The configuration into which the parameters will be written. */ protected AbstractConfigBuilder(Configuration targetConfig) { this.config = targetConfig; } // -------------------------------------------------------------------- public T setNavigationTreePattern(NavigationTreePattern ntp) { this.config.setString(PACTOperatorsConfiguration.NTP_STRING.toString(), NavigationTreePatternUtils.getParsableStringFromTreePattern(ntp)); @SuppressWarnings("unchecked") T ret = (T) this; return ret; } public T setAttachDocumentID(boolean attachDocumentID) { this.config.setBoolean(PACTOperatorsConfiguration.ATTACH_DOCUMENTID_BOOLEAN.toString(), attachDocumentID); @SuppressWarnings("unchecked") T ret = (T) this; return ret; } } /** * A builder used to set parameters to the input format's configuration in a fluent way. */ public static class ConfigBuilder extends AbstractConfigBuilder<ConfigBuilder> { /** * Creates a new builder for the given configuration. * * @param targetConfig The configuration into which the parameters will be written. */ protected ConfigBuilder(Configuration targetConfig) { super(targetConfig); } } }