com.twentyn.chemicalClassifier.Runner.java Source code

Introduction

Here is the source code for com.twentyn.chemicalClassifier.Runner.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.chemicalClassifier;

import uk.ac.cam.ch.wwmm.oscar.Oscar;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ChemicalStructure;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ResolvedNamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * A quick and dirty OSCAR classification driver to run over Chris's TSV of chemical names and InChIs.
 *
 * <p>Usage: {@code Runner <input TSV path> <output TSV path>}.
 *
 * <p>Every input line must contain exactly two tab-separated fields.  The second field is handed to OSCAR
 * as the chemical name.  Each input line is echoed to the output file with one extra column describing the
 * outcome ({@code noMatch}, {@code partialMatch}, {@code exactMatch} or {@code multipleMatches}); for an
 * exact match the standard InChI values reported by OSCAR are appended as further columns.  A
 * human-readable trace of every lookup is also printed to standard output.
 */
public class Runner {
    /** Number of tab-separated fields every input line must have. */
    private static final int EXPECTED_FIELD_COUNT = 2;

    /**
     * Reads the input TSV line by line, classifies each name with OSCAR and writes the annotated TSV.
     *
     * @param args {@code args[0]} is the input TSV path, {@code args[1]} is the output TSV path;
     *             any further arguments are ignored
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws RuntimeException if an input line does not have exactly two tab-separated fields
     * @throws Exception if reading the input or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException("Expected two arguments: <input TSV path> <output TSV path>");
        }

        // try-with-resources closes BOTH files (writer first, then reader) on every exit path, including
        // the malformed-line exception below.  Closing the BufferedWriter also flushes it.
        try (BufferedReader reader = new BufferedReader(new FileReader(args[0]));
             BufferedWriter writer = new BufferedWriter(new FileWriter(args[1]))) {
            Oscar oscar = new Oscar();

            String line;
            /* NOTE: this is exactly the wrong way to write a TSV reader.  Caveat emptor.
             * See http://tburette.github.io/blog/2014/05/25/so-you-want-to-write-your-own-CSV-code/
             * and then use org.apache.commons.csv.CSVParser instead.
             */
            while ((line = reader.readLine()) != null) {
                // TSV means split on tabs!  Nothing else will do.
                // Note that String.split drops trailing empty strings, so a line ending in a tab yields
                // fewer fields than it has columns.
                List<String> fields = Arrays.asList(line.split("\t"));
                // Choke if our invariants aren't satisfied.  We expect every line to have a name and an InChI.
                if (fields.size() != EXPECTED_FIELD_COUNT) {
                    throw new RuntimeException(
                            String.format("Found malformed line (all lines must have two fields): %s", line));
                }
                // NOTE(review): the name is taken from the second column (index 1); confirm this matches
                // the column order of the input file.
                String name = fields.get(1);
                List<ResolvedNamedEntity> entities = oscar.findAndResolveNamedEntities(name);

                System.out.println("**********");
                System.out.println("Name: " + name);
                // Output row = the original fields followed by the classification (and any InChIs).
                List<String> outputFields = new ArrayList<>(fields.size() + 1);
                outputFields.addAll(fields);
                if (entities.isEmpty()) {
                    System.out.println("No match");
                    outputFields.add("noMatch");
                } else if (entities.size() == 1) {
                    ResolvedNamedEntity entity = entities.get(0);
                    NamedEntity ne = entity.getNamedEntity();
                    // The match is only "exact" when the entity spans the whole name string.
                    if (ne.getStart() != 0 || ne.getEnd() != name.length()) {
                        System.out.println("Partial match");
                        printEntity(entity);
                        outputFields.add("partialMatch");
                    } else {
                        System.out.println("Exact match");
                        printEntity(entity);
                        outputFields.add("exactMatch");
                        List<ChemicalStructure> structures = entity.getChemicalStructures(FormatType.STD_INCHI);
                        for (ChemicalStructure s : structures) {
                            outputFields.add(s.getValue());
                        }
                    }
                } else { // Multiple matches found!
                    System.out.println("Multiple matches");
                    for (ResolvedNamedEntity e : entities) {
                        printEntity(e);
                    }
                    outputFields.add("multipleMatches");
                }

                writer.write(String.join("\t", outputFields));
                writer.newLine();
            }
        }
    }

    /**
     * Prints a resolved entity to standard output: the named entity with its confidence on one line,
     * followed by its standard-InChI chemical structures on the next.
     *
     * @param e the resolved entity to print
     */
    public static void printEntity(ResolvedNamedEntity e) {
        System.out.println("  " + e.getNamedEntity() + " @ " + e.getNamedEntity().getConfidence());
        System.out.println("    " + e.getChemicalStructures(FormatType.STD_INCHI));
    }
}