pt.ua.tm.neji.util.obo.OBO2TSV.java Source code

Introduction

Here is the source code for pt.ua.tm.neji.util.obo.OBO2TSV.java
Source

/*
 * Copyright (c) 2016 BMD Software and University of Aveiro.
 *
 * Neji is a flexible and powerful platform for biomedical information extraction from text.
 *
 * This project is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
 * To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * This project is a free software, you are free to copy, distribute, change and transmit it.
 * However, you may not use it for commercial purposes.
 *
 * It is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

package pt.ua.tm.neji.util.obo;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import pt.ua.tm.neji.util.lvg.LVG;
import pt.ua.tm.neji.util.stopwords.Stopwords;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * Created by david on 24/06/15.
 */
public class OBO2TSV {

    public static void main(String... args) {

        String inputFileName = args[0];
        String outputFileName = args[1];
        String semanticGroup = args[2];

        // Set output
        FileWriter fileWriter = null;
        try {
            fileWriter = new FileWriter(outputFileName);
        } catch (IOException e) {
            System.err.println(e);
            return;
        }

        try (BufferedReader br = new BufferedReader(new FileReader(inputFileName))) {
            String line;

            OBOTerm term = new OBOTerm();
            boolean inTerm = false;

            while ((line = br.readLine()) != null) {
                // process the line.
                if (line.contains("[Term]")) {
                    if (inTerm && term.getId() != null && !term.getNames().isEmpty()) {
                        writeTerm(fileWriter, term, semanticGroup);
                    }
                    term = new OBOTerm();
                    inTerm = true;
                } else if (line.contains("[Typedef]")) {
                    inTerm = false;
                } else if (line.contains("alt_id:")) {
                    continue;
                } else if (line.contains("ref:")) {
                    continue;
                } else if (line.contains("xref:")) {
                    continue;
                } else if (line.contains("def:")) {
                    continue;
                } else if (line.contains("id:")) {
                    if (!inTerm)
                        continue;
                    String id = line.substring(3).trim();
                    term.setId(id);
                } else if (line.contains("name:")) {
                    if (!inTerm)
                        continue;
                    String name = line.substring(5).trim();
                    name = format(name);
                    term.getNames().add(name);
                } else if (line.contains("synonym:")) {
                    if (!inTerm)
                        continue;
                    String synonym = line.substring(line.indexOf('"') + 1, line.lastIndexOf('"')).trim();
                    synonym = format(synonym);
                    term.getNames().add(synonym);
                }
            }
            writeTerm(fileWriter, term, semanticGroup);
        } catch (Exception ex) {
            System.err.println(ex);
            return;
        }

        try {
            fileWriter.close();
        } catch (IOException e) {
            System.err.println(e);
            return;
        }
    }

    private static void writeTerm(final FileWriter fileWriter, final OBOTerm term, final String semanticGroup)
            throws Exception {
        Set<String> names = new HashSet<>();
        for (String n : term.getNames()) {
            Set<String> variations = LVG.getLVGNames(n);

            String name = n.toLowerCase();
            if (name.length() > 2 && !Stopwords.isStopword(name)) {
                names.add(name);
            }
            if (!variations.isEmpty()) {
                names.addAll(variations);
            }
        }

        //        System.out.println(term.getId());
        //        System.out.println(StringUtils.join(names, '|'));

        if (!names.isEmpty()) {
            StringBuilder sb = new StringBuilder();
            sb.append(term.getId());
            sb.append("::");
            sb.append(semanticGroup);
            sb.append("\t");
            sb.append(StringUtils.join(names, '|'));
            sb.append("\n");

            fileWriter.write(sb.toString());
        }
    }

    private static String format(final String in) {
        String name = in;
        name = name.replaceAll("_", "");
        name = name.replaceAll("\\s+\\((Japanese|Spanish)\\)", "");
        name = name.replaceAll("(&#[0-9]+)", "$1;");
        name = StringEscapeUtils.unescapeXml(name);
        name = name.toLowerCase();
        return name;
    }

}