ubic.pubmedgate.interactions.DuplicateXMLParses.java Source code

Introduction

Here is the source code for ubic.pubmedgate.interactions.DuplicateXMLParses.java
Source

/*
 * The WhiteText project
 * 
 * Copyright (c) 2012 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package ubic.pubmedgate.interactions;

import java.io.FileWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.QName;
import org.dom4j.io.SAXReader;

import ubic.pubmedgate.Config;

/**
 * Bit of a hack to create additional XML entries for other parsers.
 * 
 * @author leon
 */
public class DuplicateXMLParses {
    protected static Log log = LogFactory.getLog(DuplicateXMLParses.class);

    public static void main(String[] args)/* throws Exception*/ {
        DuplicateXMLParses parser;
        //        parser = new DuplicateXMLParses( "WhiteTextUnseen" );
        try {
            parser = new DuplicateXMLParses("WhiteTextNegFixFullCountCheck");
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        //        parser = new DuplicateXMLParses( "WhiteTextNegFixTrain" );

    }

    public DuplicateXMLParses(String baseName) throws Exception {
        String filename = Config.config.getString("whitetext.iteractions.ppiBaseFolder")
                + "Corpora/Syntax-Tree-Learning-Format/" + baseName + ".xml";
        SAXReader saxReader = new SAXReader();
        Document document = saxReader.read(filename);
        String path;
        Map<String, String> attributes;

        // only run once
        path = "//corpus/document/sentence/sentenceanalyses/parses/parse";
        attributes = new HashMap<String, String>();
        attributes.put("tokenizer", "split");
        attributes.put("parser", "split-parse");

        fixXML(document, path, attributes);

        // path = "//corpus/document/sentence/sentenceanalyses/parses/parse";
        // attributes = new HashMap<String, String>();
        // // might dupicate
        // attributes.put( "parser", "Charniak-Lease" );
        //
        // fixXML( document, path, attributes );

        path = "//corpus/document/sentence/sentenceanalyses/tokenizations/tokenization";
        attributes = new HashMap<String, String>();
        attributes.put("tokenizer", "split");

        fixXML(document, path, attributes);

        // what about bracketings?

        // open xml file
        // iterate documents
        // iterate sentences
        // duplicate senteence parses
        FileWriter fout = new FileWriter(filename);
        document.write(fout);
        fout.close();
    }

    private void fixXML(Document document, String path, Map<String, String> attributes) {
        List list = document.selectNodes(path);
        Iterator iter = list.iterator();
        while (iter.hasNext()) {
            String attribute = attributes.keySet().iterator().next();
            Element el = (Element) iter.next();

            if (el.attribute("parser") != null
                    && el.attribute("parser").getValue().equals("Charniak-Johnson-McClosky")) {
                el.addAttribute("parser", "Charniak-Lease");
            }

            String tokenizer = el.attribute(attribute).getValue();
            String replacement = attributes.get(attribute);
            if (!tokenizer.equals(replacement)) {
                log.info(tokenizer + " -> " + replacement);
                Element newElement = el.createCopy();
                for (String attributeFix : attributes.keySet()) {
                    newElement.addAttribute(new QName(attributeFix), attributes.get(attributeFix));
                }
                el.getParent().add(newElement);
            }

        }
    }
}