com.joliciel.frenchTreebank.upload.TreebankSAXParser.java Source code

Java tutorial

Introduction

Here is the source code for com.joliciel.frenchTreebank.upload.TreebankSAXParser.java

Source

///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2012 Assaf Urieli
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane.  If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.frenchTreebank.upload;

import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import com.joliciel.frenchTreebank.PhraseSubunit;
import com.joliciel.frenchTreebank.PhraseUnit;
import com.joliciel.frenchTreebank.Sentence;
import com.joliciel.frenchTreebank.TreebankFile;
import com.joliciel.frenchTreebank.TreebankService;

public class TreebankSAXParser extends DefaultHandler {
    private static final Log LOG = LogFactory.getLog(TreebankSAXParser.class);

    private TreebankFile treebankFile;
    private TreebankService treebankService;
    private Sentence sentence = null;
    private PhraseUnit phraseUnit = null;
    private PhraseSubunit phraseSubunit = null;
    private String tempVal = null;
    boolean isPhraseUnitCompound = false;

    public TreebankSAXParser() {

    }

    public void parseDocument(String filePath) {
        //get or create the treebank file
        treebankFile = this.getTreebankService().loadOrCreateTreebankFile(filePath);

        //get a factory
        SAXParserFactory spf = SAXParserFactory.newInstance();
        try {

            //get a new instance of parser
            SAXParser sp = spf.newSAXParser();

            //parse the file and also register this class for call backs
            sp.parse(filePath, this);

        } catch (SAXException se) {
            se.printStackTrace();
        } catch (ParserConfigurationException pce) {
            pce.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
    }

    //Event Handlers
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        //reset
        //LOG.debug("<" + qName + ">");
        // clear out tempVal whenever a new element is started
        tempVal = "";
        if (qName.equalsIgnoreCase("SENT")) {
            // a new sentence
            sentence = treebankService.newSentence();
            String sentenceNumber = attributes.getValue("nb");
            LOG.debug("Sentence number " + sentenceNumber);
            sentence.setSentenceNumber(sentenceNumber);
            sentence.setFile(treebankFile);
        } else if (qName.equalsIgnoreCase("w")) {
            // a new word or compound word
            if (phraseUnit == null) {
                String categoryCode = attributes.getValue("cat");
                String subCategoryCode = attributes.getValue("subcat");
                String morphologyCode = attributes.getValue("mph");
                String lemma = attributes.getValue("lemma");
                boolean isCompound = false;
                String isCompoundStr = attributes.getValue("compound");
                if (isCompoundStr != null && isCompoundStr.equalsIgnoreCase("yes"))
                    isCompound = true;
                String compoundId = attributes.getValue("id");
                String compoundNextId = attributes.getValue("next");
                String compoundPrevId = attributes.getValue("prev");
                //String isCompound = attributes.getValue("compound");
                // ignoring compound attribute as not reliable - instead relying on embedded words to indicate a compound phrase unit
                LOG.debug("Opening w " + lemma);
                phraseUnit = sentence.newPhraseUnit(categoryCode, subCategoryCode, morphologyCode, lemma,
                        isCompound, compoundId, compoundNextId, compoundPrevId);
            } else {
                isPhraseUnitCompound = true;
                String categoryCode = attributes.getValue("catint");
                String subCategoryCode = attributes.getValue("subcat");
                String morphologyCode = attributes.getValue("mph");
                LOG.debug("Opening subunit " + categoryCode);
                phraseSubunit = phraseUnit.newSubunit(categoryCode, subCategoryCode, morphologyCode);
            }
        } else if (qName.equalsIgnoreCase("text")) {
            // top level text tag, we don't need to do nothing
        } else {
            // must be a phrase
            if (sentence != null) {
                String functionCode = attributes.getValue("fct");
                LOG.debug("Opening phrase " + qName + ", " + functionCode);
                sentence.openPhrase(qName, functionCode);
            }
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
        // add the characters to tempVal
        tempVal += new String(ch, start, length);
    }

    public void endElement(String uri, String localName, String qName) throws SAXException {
        //LOG.debug("</" + qName + ">");
        if (qName.equalsIgnoreCase("SENT")) {
            //add it to the list
            sentence.close();
            LOG.debug("Saving sentence");
            sentence.save();
            sentence = null;
        } else if (qName.equalsIgnoreCase("w")) {
            if (phraseSubunit != null) {
                LOG.debug("Closing subunit " + tempVal);
                phraseSubunit.setText(tempVal.trim());
                phraseSubunit = null;
            } else if (phraseUnit != null) {
                LOG.debug("Closing w " + tempVal);
                if (!isPhraseUnitCompound)
                    phraseUnit.setText(tempVal.trim());
                phraseUnit = null;
                isPhraseUnitCompound = false;
            }
        } else if (qName.equalsIgnoreCase("text")) {
            // top level text tag, we don't need to do nothing
        } else {
            // must be a phrase
            LOG.debug("Closing phrase " + qName);
            if (sentence != null)
                sentence.closePhrase();
        }
    }

    public TreebankService getTreebankService() {
        return treebankService;
    }

    public void setTreebankService(TreebankService treebankService) {
        this.treebankService = treebankService;
    }
}