de.tuberlin.dima.cuttlefish.preprocessing.parsing.NewsItemXmlParser.java Source code

Java tutorial

Introduction

Here is the source code for de.tuberlin.dima.cuttlefish.preprocessing.parsing.NewsItemXmlParser.java

Source

/*
 * Copyright (C) 2013 Database Systems and Information Management Group,
 * TU Berlin
 *
 * cuttlefish is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * cuttlefish is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with cuttlefish; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

package de.tuberlin.dima.cuttlefish.preprocessing.parsing;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import de.tuberlin.dima.cuttlefish.preprocessing.NewsItem;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.InputStream;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

/**
 * Static utility that turns one XML news-item document into a {@link NewsItem}.
 *
 * <p>The parser reads the following parts of the document:
 * <ul>
 *   <li>the text content of the first {@code title}, {@code headline}, {@code text} and
 *       {@code dateline} elements (an empty string is used when an element is absent),</li>
 *   <li>the {@code itemid} and {@code date} attributes of the first {@code newsitem} element,</li>
 *   <li>for every {@code codes} element, the {@code code} attribute of each direct {@code code}
 *       child, grouped by the {@code class} attribute of the enclosing {@code codes} element,</li>
 *   <li>the {@code element}/{@code value} attribute pairs of every {@code dc} element.</li>
 * </ul>
 *
 * <p>NOTE(review): the JAXP documentation does not guarantee that a shared
 * {@link DocumentBuilderFactory} is thread-safe; confirm that callers do not invoke
 * {@link #toNewsItem(InputStream)} concurrently.
 */
class NewsItemXmlParser {

    private static final DocumentBuilderFactory DB_FACTORY = DocumentBuilderFactory.newInstance();

    /** Pattern of the {@code date} attribute on the {@code newsitem} element. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /**
     * Resolves every external entity (including an external DTD subset) to empty content so that
     * parsing never opens files or network connections named inside the document (XXE hardening).
     * The resolver is stateless and can therefore be shared between builders.
     */
    private static final EntityResolver NO_EXTERNAL_ENTITIES = new EntityResolver() {
        @Override
        public InputSource resolveEntity(String publicId, String systemId) {
            return new InputSource(new StringReader(""));
        }
    };

    private NewsItemXmlParser() {
    }

    /**
     * Parses the given XML stream into a {@link NewsItem}.
     *
     * <p>The stream is not closed by this method.
     *
     * @param xml stream containing a single news-item XML document
     * @return the news item described by the document
     * @throws IllegalArgumentException if the document has no {@code newsitem} element or if a
     *         required attribute ({@code itemid}, {@code date}, {@code class}, {@code code},
     *         {@code element}, {@code value}) is missing
     * @throws NumberFormatException if the {@code itemid} attribute is not an integer
     * @throws java.text.ParseException if the {@code date} attribute does not match
     *         {@code yyyy-MM-dd}
     * @throws Exception if the XML cannot be read or is not well-formed
     */
    public static NewsItem toNewsItem(InputStream xml) throws Exception {

        DocumentBuilder builder = DB_FACTORY.newDocumentBuilder();
        builder.setEntityResolver(NO_EXTERNAL_ENTITIES);
        Document doc = builder.parse(xml);

        doc.getDocumentElement().normalize();

        String title = textContentOrEmptyString(doc, "title");
        String headline = textContentOrEmptyString(doc, "headline");
        String text = textContentOrEmptyString(doc, "text");
        String dateline = textContentOrEmptyString(doc, "dateline");

        Node newsItemNode = doc.getElementsByTagName("newsitem").item(0);
        if (newsItemNode == null) {
            throw new IllegalArgumentException("XML document contains no <newsitem> element");
        }
        int itemId = Integer.parseInt(requiredAttribute(newsItemNode, "itemid"));
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        DateFormat df = new SimpleDateFormat(DATE_PATTERN);
        Date date = df.parse(requiredAttribute(newsItemNode, "date"));

        Multimap<String, String> codes = readCodes(doc);
        Map<String, String> dcs = readDcs(doc);

        return new NewsItem(itemId, date, title, headline, text, dateline, codes, dcs);
    }

    /** Collects the code values of all {@code codes} elements, keyed by their {@code class} attribute. */
    private static Multimap<String, String> readCodes(Document doc) {
        Multimap<String, String> codes = ArrayListMultimap.create();
        NodeList codesNodes = doc.getElementsByTagName("codes");
        int numCodes = codesNodes.getLength();
        for (int codesIndex = 0; codesIndex < numCodes; codesIndex++) {
            Node codesNode = codesNodes.item(codesIndex);
            String codeClass = requiredAttribute(codesNode, "class");

            NodeList codeNodes = codesNode.getChildNodes();
            int numChildren = codeNodes.getLength();
            for (int codeIndex = 0; codeIndex < numChildren; codeIndex++) {
                Node child = codeNodes.item(codeIndex);
                // Direct children also include whitespace text nodes; only <code> children count.
                if ("code".equals(child.getNodeName())) {
                    codes.put(codeClass, requiredAttribute(child, "code"));
                }
            }
        }
        return codes;
    }

    /**
     * Collects the {@code element} to {@code value} mapping of all {@code dc} elements. When the
     * same {@code element} name occurs more than once, the last occurrence wins.
     */
    private static Map<String, String> readDcs(Document doc) {
        Map<String, String> dcs = Maps.newHashMap();
        NodeList dcNodes = doc.getElementsByTagName("dc");
        int numDcs = dcNodes.getLength();
        for (int index = 0; index < numDcs; index++) {
            Node dcNode = dcNodes.item(index);
            dcs.put(requiredAttribute(dcNode, "element"), requiredAttribute(dcNode, "value"));
        }
        return dcs;
    }

    /**
     * Returns the value of the named attribute of {@code node}.
     *
     * @throws IllegalArgumentException if the node does not carry that attribute
     */
    private static String requiredAttribute(Node node, String attribute) {
        // getAttributes() is null for non-element nodes (e.g. processing instructions).
        Node value = node.getAttributes() == null ? null : node.getAttributes().getNamedItem(attribute);
        if (value == null) {
            throw new IllegalArgumentException("Element <" + node.getNodeName()
                    + "> is missing required attribute '" + attribute + "'");
        }
        return value.getNodeValue();
    }

    /** Returns the text content of the first element named {@code tag}, or {@code ""} if there is none. */
    private static String textContentOrEmptyString(Document doc, String tag) {
        NodeList elements = doc.getElementsByTagName(tag);
        if (elements.getLength() > 0) {
            return elements.item(0).getTextContent();
        } else {
            return "";
        }
    }

}