pt.ua.tm.neji.evaluation.whatizit.XML2A1Module.java Source code

Java tutorial

Introduction

Here is the source code for pt.ua.tm.neji.evaluation.whatizit.XML2A1Module.java

Source

/*
 * Copyright (c) 2016 BMD Software and University of Aveiro.
 *
 * Neji is a flexible and powerful platform for biomedical information extraction from text.
 *
 * This project is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
 * To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * This project is a free software, you are free to copy, distribute, change and transmit it.
 * However, you may not use it for commercial purposes.
 *
 * It is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

package pt.ua.tm.neji.evaluation.whatizit;

import monq.jfa.ByteCharSource;
import monq.jfa.DfaRun;
import monq.jfa.Xml;
import org.apache.commons.lang3.StringEscapeUtils;
import pt.ua.tm.neji.core.module.BaseModule;
import pt.ua.tm.neji.evaluation.Concept;
import pt.ua.tm.neji.evaluation.ConceptList;
import pt.ua.tm.neji.exception.NejiException;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Module that reads text annotated with inline Whatizit XML tags ({@code z:species},
 * {@code z:uniprot}, {@code z:go}, {@code z:chebi} and {@code z:e}) and collects two results:
 * the plain text (tags dropped, XML entities unescaped), available through {@link #getText()},
 * and the annotated concepts, available through {@link #getConceptList()}.
 * <p>
 * Concept offsets are positions in the plain text, not in the XML input.
 * <p>
 * NOTE(review): nothing in this class resets the collected text, the concepts or the internal
 * offsets, so a fresh instance is presumably expected for every document; confirm with callers.
 *
 * @author David Campos (<a href="mailto:david.campos@ua.pt">david.campos@ua.pt</a>)
 * @version 1.0
 * @since 1.0
 */
public class XML2A1Module extends BaseModule {

    /** Attributes of the most recently inspected {@code z:go} start tag. */
    private Map<String, String> map;
    /** Plain text accumulated so far (annotation contents plus the text between annotations). */
    private final StringBuilder text;
    /** Concepts collected so far, with offsets relative to {@link #text}. */
    private final ConceptList conceptList;
    /** Offset in the matcher buffer where the currently open annotation tag starts. */
    private int startTag;
    /** Offset in the matcher buffer just after the last closing annotation tag that was handled. */
    private int previousEnd;
    /** Extracts the comma separated value of the {@code ids} attribute of an annotation tag. */
    Pattern idsPattern = Pattern.compile("ids=\"(.+?)\"");

    private static final String tagSPEC = "z:species";
    private static final String tagPRGE = "z:uniprot";
    private static final String tagGO = "z:go";
    private static final String tagCHED = "z:chebi";
    private static final String tagDISO = "z:e";

    /** Records where an annotation tag opens and flushes the text that precedes it. */
    private StartAction category_start = new StartAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            startTag = start;
            appendPlainText(yytext, start);
        }
    };

    // NOTE: in the EndAction, runner.collect is set to false... if this module
    // does not work, try swapping the Action class to StartAction

    private EndAction spec_end = new EndAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            addTrigger(yytext, start, tagSPEC, "SPEC", true);
        }
    };

    private EndAction prge_end = new EndAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            addTrigger(yytext, start, tagPRGE, "PRGE", true);
        }
    };

    private EndAction ched_end = new EndAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            addTrigger(yytext, start, tagCHED, "CHED", true);
        }
    };

    private EndAction diso_end = new EndAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            addTrigger(yytext, start, tagDISO, "DISO", true);
        }
    };

    /** Same as the other end actions, except that the group depends on the {@code onto} attribute. */
    private EndAction go_end = new EndAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            addTrigger(yytext, start, tagGO, resolveGoGroup(yytext), true);
        }
    };

    /** Flushes whatever text follows the last annotation once the input is exhausted. */
    private EofAction eof_action = new EofAction() {
        @Override
        public void execute(StringBuffer yytext, int start) {
            appendPlainText(yytext, start);
        }
    };

    /**
     * Registers the start, end and end-of-file actions for the supported tags and initialises
     * the empty result containers.
     *
     * @throws NejiException declared by the original constructor; presumably raised by the
     *                       {@code BaseModule} registration calls (not visible here)
     */
    public XML2A1Module() throws NejiException {
        super(DfaRun.UNMATCHED_COPY);
        super.addActionToXMLTag(category_start, tagSPEC, tagPRGE, tagCHED, tagGO, tagDISO);
        super.addActionToXMLTag(spec_end, tagSPEC);
        super.addActionToXMLTag(prge_end, tagPRGE);
        super.addActionToXMLTag(ched_end, tagCHED);
        super.addActionToXMLTag(go_end, tagGO);
        super.addActionToXMLTag(diso_end, tagDISO);
        super.setEofAction(eof_action);

        this.conceptList = new ConceptList();
        this.text = new StringBuilder();
        this.map = new HashMap<>();
        this.startTag = 0;
        this.previousEnd = 0;
    }

    /**
     * @return the plain text collected so far (the live builder, not a copy)
     */
    public StringBuilder getText() {
        return text;
    }

    /**
     * @return the concepts collected so far (the live list, not a copy)
     */
    public ConceptList getConceptList() {
        return conceptList;
    }

    /**
     * Appends to the plain text the un-annotated input located between the end of the last
     * handled annotation and {@code end}, after unescaping its XML entities.
     *
     * @param yytext matcher buffer holding the XML input
     * @param end    exclusive end offset, in {@code yytext}, of the span to flush
     */
    private void appendPlainText(StringBuffer yytext, int end) {
        String previous = yytext.substring(previousEnd, end);
        previous = StringEscapeUtils.unescapeXml(previous);
        // Literal replacement: the previous replaceAll("+", "?") treated "+" as a regular
        // expression, which is a dangling metacharacter and throws PatternSyntaxException.
        // NOTE(review): the "+" may stand for a non-ASCII character lost in transcoding;
        // confirm the intended character against the upstream sources.
        previous = previous.replace("+", "?");
        text.append(previous);
    }

    /**
     * Maps the {@code onto} attribute of the currently open {@code z:go} tag to a concept group.
     *
     * @param yytext matcher buffer holding the XML input
     * @return {@code "COMP"} for cellular components, {@code "PROC_FUNC"} for biological
     *         processes and molecular functions
     * @throws IllegalStateException if the attribute is missing or has any other value
     */
    private String resolveGoGroup(StringBuffer yytext) {
        map = Xml.splitElement(yytext, startTag);
        String onto = map.get("onto");
        // Constant-first comparisons keep a missing attribute from raising a NullPointerException.
        if ("cellular_component".equals(onto)) {
            return "COMP";
        }
        if ("biological_process".equals(onto) || "molecular_function".equals(onto)) {
            return "PROC_FUNC";
        }
        throw new IllegalStateException("Non recognized group! (onto=" + onto + ")");
    }

    /**
     * Reads the identifiers listed in the {@code ids} attribute of an annotation tag.
     *
     * @param annotationTag the complete annotation, from its start tag to its end tag
     * @param concept       concept group; {@code PRGE}, {@code SPEC} and {@code CHED} identifiers
     *                      are prefixed with {@code UNIPROT:}, {@code NCBI:} and {@code CHEBI:}
     *                      respectively, any other group is left without prefix
     * @return the identifiers in document order; empty when the tag has no {@code ids} attribute
     */
    private List<String> getIDs(final String annotationTag, final String concept) {
        List<String> idsList = new ArrayList<>();

        Matcher matcher = idsPattern.matcher(annotationTag);
        if (!matcher.find()) {
            // Without this guard, group(1) throws IllegalStateException on tags lacking "ids".
            return idsList;
        }

        // The prefix only depends on the group, so it is resolved once for all identifiers.
        String prefix = "";
        if (concept.equals("PRGE")) {
            prefix = "UNIPROT:";
        } else if (concept.equals("SPEC")) {
            prefix = "NCBI:";
        } else if (concept.equals("CHED")) {
            prefix = "CHEBI:";
        }

        for (String id : matcher.group(1).split(",")) {
            idsList.add(prefix + id);
        }

        return idsList;
    }

    /**
     * Handles a closing annotation tag: appends the annotated text to the plain text and,
     * optionally, records the matching concept.
     *
     * @param yytext matcher buffer holding the XML input
     * @param start  offset in {@code yytext} where the closing tag starts
     * @param tag    qualified tag name, used to compute the length of the closing tag
     * @param group  concept group assigned to the annotation
     * @param add    when {@code false} the text is still appended but no concept is recorded
     */
    private void addTrigger(StringBuffer yytext, int start, String tag, String group, boolean add) {
        // The closing tag is "</" + tag + ">", hence the three extra characters.
        int endOfClosingTag = start + tag.length() + 3;
        String annotationTag = yytext.substring(startTag, endOfClosingTag);

        // The annotated text sits between the end of the start tag and the closing tag.
        String annotation = annotationTag.substring(annotationTag.indexOf(">") + 1, annotationTag.lastIndexOf("<"));
        annotation = StringEscapeUtils.unescapeXml(annotation);

        // Offsets are taken before appending so that they point at the annotation itself.
        int startAnnotation = text.length();
        int endAnnotation = startAnnotation + annotation.length();

        text.append(annotation);

        if (add) {
            Concept concept = new Concept(startAnnotation, endAnnotation, group, annotation);
            concept.getIdentifiers().addAll(getIDs(annotationTag, group));
            conceptList.add(concept);
        }

        previousEnd = endOfClosingTag;
    }

    /**
     * Compiles the module and runs it over the given input, filling the plain text and the
     * concept list as a side effect.
     *
     * @param in stream with the annotated XML; it is not closed by this method
     * @throws NejiException if reading the input fails
     */
    public void process(final InputStream in) throws NejiException {
        try {
            compile();
            DfaRun run = new DfaRun(getDFA());
            run.setIn(new ByteCharSource(in));
            run.filter();
        } catch (IOException ex) {
            throw new NejiException(ex);
        }
    }
}