Java tutorial
/* * Copyright (C) 2013 Database Systems and Information Management Group, * TU Berlin * * cuttlefish is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * cuttlefish is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with cuttlefish; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA */ package de.tuberlin.dima.cuttlefish.preprocessing.parsing; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import de.tuberlin.dima.cuttlefish.preprocessing.NewsItem; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import javax.xml.parsers.DocumentBuilderFactory; import java.io.InputStream; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map; class NewsItemXmlParser { private static final DocumentBuilderFactory DB_FACTORY = DocumentBuilderFactory.newInstance(); private NewsItemXmlParser() { } public static NewsItem toNewsItem(InputStream xml) throws Exception { Document doc = DB_FACTORY.newDocumentBuilder().parse(xml); doc.getDocumentElement().normalize(); String title = textContentOrEmptyString(doc, "title"); String headline = textContentOrEmptyString(doc, "headline"); String text = textContentOrEmptyString(doc, "text"); String dateline = textContentOrEmptyString(doc, "dateline"); Node newsItemNode = doc.getElementsByTagName("newsitem").item(0); int itemID = Integer.parseInt(newsItemNode.getAttributes().getNamedItem("itemid").getNodeValue()); DateFormat df = new SimpleDateFormat("yyyy-MM-dd"); Date date = df.parse(newsItemNode.getAttributes().getNamedItem("date").getNodeValue()); Multimap<String, String> codes = ArrayListMultimap.create(); NodeList codesNodes = doc.getElementsByTagName("codes"); int numCodes = codesNodes.getLength(); for (int codesIndex = 0; codesIndex < numCodes; codesIndex++) { Node codesNode = codesNodes.item(codesIndex); String codeClass = codesNode.getAttributes().getNamedItem("class").getNodeValue(); NodeList codeNodes = codesNode.getChildNodes(); for (int codeIndex = 0; codeIndex < codeNodes.getLength(); codeIndex++) { if ("code".equals(codeNodes.item(codeIndex).getNodeName())) { String codeValue = codeNodes.item(codeIndex).getAttributes().getNamedItem("code") .getNodeValue(); codes.put(codeClass, codeValue); } } } Map<String, String> dcs = Maps.newHashMap(); NodeList dcNodes = doc.getElementsByTagName("dc"); int numDcs = dcNodes.getLength(); for (int index = 0; index < numDcs; index++) { String dcElement = dcNodes.item(index).getAttributes().getNamedItem("element").getNodeValue(); String dcValue = dcNodes.item(index).getAttributes().getNamedItem("value").getNodeValue(); dcs.put(dcElement, dcValue); } return new NewsItem(itemID, date, title, headline, text, dateline, codes, dcs); } private static String textContentOrEmptyString(Document doc, String tag) { NodeList elements = doc.getElementsByTagName(tag); if (elements.getLength() > 0) { return elements.item(0).getTextContent(); } else { return ""; } } }