com.celebihacker.ml.preprocess.rcv1.indexing.parsing.XmlToNewsItemParser.java Source code

Introduction

Here is the source code for com.celebihacker.ml.preprocess.rcv1.indexing.parsing.XmlToNewsItemParser.java
Source

/*
 * Copyright (C) 2013 Database Systems and Information Management Group, TU Berlin
 * 
 * cuttlefish is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later
 * version.
 * 
 * cuttlefish is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with cuttlefish; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package com.celebihacker.ml.preprocess.rcv1.indexing.parsing;

import com.celebihacker.ml.preprocess.rcv1.indexing.types.NewsItem;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilderFactory;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

public class XmlToNewsItemParser {

    private static final DocumentBuilderFactory DB_FACTORY = DocumentBuilderFactory.newInstance();

    private XmlToNewsItemParser() {
    }

    public static NewsItem toNewsItem(InputStream xml) throws Exception {
        Document doc = DB_FACTORY.newDocumentBuilder().parse(xml);

        doc.getDocumentElement().normalize();

        Node newsItemNode = doc.getElementsByTagName("newsitem").item(0);
        int itemID = Integer.parseInt(newsItemNode.getAttributes().getNamedItem("itemid").getNodeValue());

        String title = textContentOrEmptyString(doc, "title");
        String headline = textContentOrEmptyString(doc, "headline");
        String text = textContentOrEmptyString(doc, "text");
        String dateline = textContentOrEmptyString(doc, "dateline");

        DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
        Date date = df.parse(newsItemNode.getAttributes().getNamedItem("date").getNodeValue());

        Multimap<String, String> codes = ArrayListMultimap.create();
        NodeList codesNodes = doc.getElementsByTagName("codes");
        int numCodes = codesNodes.getLength();

        for (int codesIndex = 0; codesIndex < numCodes; codesIndex++) {
            Node codesNode = codesNodes.item(codesIndex);
            String codeClass = codesNode.getAttributes().getNamedItem("class").getNodeValue();

            NodeList codeNodes = codesNode.getChildNodes();
            for (int codeIndex = 0; codeIndex < codeNodes.getLength(); codeIndex++) {

                if ("code".equals(codeNodes.item(codeIndex).getNodeName())) {
                    String codeValue = codeNodes.item(codeIndex).getAttributes().getNamedItem("code")
                            .getNodeValue();
                    codes.put(codeClass, codeValue);
                }
            }
        }

        Map<String, String> dcs = Maps.newHashMap();
        NodeList dcNodes = doc.getElementsByTagName("dc");
        int numDcs = dcNodes.getLength();
        for (int index = 0; index < numDcs; index++) {
            String dcElement = dcNodes.item(index).getAttributes().getNamedItem("element").getNodeValue();
            String dcValue = dcNodes.item(index).getAttributes().getNamedItem("value").getNodeValue();
            dcs.put(dcElement, dcValue);
        }

        if (NewsItemV2Patcher.patch(codes))
            return new NewsItem(itemID, date, title, headline, text, dateline, codes, dcs);

        return null;
    }

    private static String textContentOrEmptyString(Document doc, String tag) {
        NodeList elements = doc.getElementsByTagName(tag);
        if (elements.getLength() > 0) {
            return elements.item(0).getTextContent();
        }

        return "";
    }
}