uk.ac.ebi.apps.benchmark.ChemicalNameSearch.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ebi.apps.benchmark.ChemicalNameSearch.java

Source

/*
 * Copyright (c) 2013. EMBL, European Bioinformatics Institute
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package uk.ac.ebi.apps.benchmark;

import com.google.common.base.Joiner;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import org.apache.commons.cli.Option;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import uk.ac.ebi.mdk.apps.CommandLineMain;
import uk.ac.ebi.mdk.domain.identifier.Identifier;
import uk.ac.ebi.mdk.service.query.name.*;
import uk.ac.ebi.mdk.tool.resolve.ChemicalFingerprintEncoder;
import uk.ac.ebi.mdk.tool.resolve.FingerprintEncoder;

import java.io.*;
import java.util.*;
import java.util.concurrent.Callable;

/**
 * Benchmarks the searching of chemical names measuring
 * <p/>
 * approximate=true/false
 * method=preferred name, all names
 * <p/>
 * string=fingerprint
 * Time
 * Recall
 * Precision
 */
public class ChemicalNameSearch extends CommandLineMain {

    private static final Logger LOGGER = Logger.getLogger("[BENCHMARK] ChemicalNameSearch");

    private List<String> names;
    private int found = 0;
    private int hits = 0;

    /**
     * Command line entry point. Applies the log4j basic configuration and
     * passes the arguments on to a fresh benchmark instance.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        BasicConfigurator.configure();
        ChemicalNameSearch benchmark = new ChemicalNameSearch();
        benchmark.process(args);
    }

    /**
     * Registers the command line options of the benchmark: the two name
     * inputs (classpath resource and file), the approximate-match switch, the
     * name service selection and the greedy switch.
     */
    @Override
    public void setupOptions() {
        Option[] options = new Option[] {
                new Option("i", "input", true, "List of names to search (resource)"),
                new Option("f", "input-file", true, "List of names to search (file)"),
                new Option("a", "approximate", false, "Search using approximate match"),
                new Option("s", "service", true, "Name service to use (chebi, kegg, metacyc, lipidmaps, hmdb, all)"),
                new Option("g", "greedy", false, "Greedy Service searching (multiple services only)")
        };
        // registration order is kept identical to the declaration order above
        for (Option option : options) {
            add(option);
        }
    }

    /**
     * Benchmarks several name services used in combination.
     * <p/>
     * Every service named by the "service" option is created and started. Each
     * loaded name is then searched in the services in the order they were given;
     * unless the greedy option ("g") is set, the search for a name stops at the
     * first service that returns any hit. All hits are afterwards resolved to
     * their names using the service registered for the hit's identifier class.
     * <p/>
     * A single tab separated row is printed to standard out: combined label,
     * search time (ms), resolve time (ms), number of names with at least one hit,
     * number of names with a fingerprint-identical hit, and the max, mean and
     * standard deviation of the hit indices.
     * <p/>
     * Note that the {@code found} counter is an instance field and is only ever
     * incremented here.
     */
    public void testMultiple() {

        System.out.println("Testing multiple");

        // keyed by the identifier class each service reports, so that a hit can be
        // routed back to a service able to provide its names
        Map<Class, NameService> services = new LinkedHashMap<Class, NameService>();

        StringBuilder sb = new StringBuilder("Combined: ");
        for (String serviceName : getCommandLine().getOptionValues("service")) {
            sb.append(serviceName).append(", ");
            NameService service = getNameService(serviceName);
            // report a failed start-up instead of silently ignoring it (as process() does)
            if (!service.startup()) {
                System.err.println("Unable to start service: " + serviceName);
            }
            services.put(service.getIdentifier().getClass(), service);
        }

        // command line flags do not change while searching, read them once
        final boolean greedy = has("g");
        final boolean approximate = has("a");

        Multimap<String, Identifier> results = ArrayListMultimap.create();

        long searchStart = System.currentTimeMillis();
        for (String name : names) {
            boolean foundHit = false;
            for (NameService service : services.values()) {

                Collection matches = service.searchName(name, approximate);
                results.putAll(name, matches);
                if (!matches.isEmpty()) {
                    // count each name once, even when several services hit in greedy mode
                    if (!foundHit) {
                        found++;
                        foundHit = true;
                    }
                    if (!greedy)
                        break;
                }

            }
        }
        long searchEnd = System.currentTimeMillis();

        long searchTime = searchEnd - searchStart;

        Multimap<String, Set<String>> nameResults = ArrayListMultimap.create();

        long resolveStart = System.currentTimeMillis();
        for (Map.Entry<String, Identifier> e : results.entries()) {
            Identifier id = e.getValue();
            NameService resolver = services.get(id.getClass());
            if (resolver == null) {
                // no service was registered under this exact identifier class; skip the
                // hit with a warning rather than failing with a NullPointerException
                System.err.println("No service registered for identifier type: " + id.getClass().getName());
                continue;
            }
            nameResults.put(e.getKey(), new HashSet<String>(resolver.getNames(id)));
        }
        long resolveEnd = System.currentTimeMillis();

        long resolveTime = resolveEnd - resolveStart;

        int trueFound = getRealScore(nameResults, new ChemicalFingerprintEncoder(), null);

        SummaryStatistics statistics = getHitIndices(nameResults, new ChemicalFingerprintEncoder());

        String[] row = new String[] { sb.toString(), Long.toString(searchTime), Long.toString(resolveTime),
                Integer.toString(found), Integer.toString(trueFound), Double.toString(statistics.getMax()),
                Double.toString(statistics.getMean()), Double.toString(statistics.getStandardDeviation()) };

        System.out.println(Joiner.on("\t").join(row));

    }

    /**
     * Runs the benchmark.
     * <p/>
     * The names are loaded first. When more than one service was given on the
     * command line the work is delegated to {@link #testMultiple()}. Otherwise
     * the single named service is started, every name is searched, the hits are
     * resolved to their names and one tab separated row is printed to standard
     * out: service name, search time (ms), resolve time (ms), number of names
     * with at least one hit, number of names with a fingerprint-identical hit,
     * and the max, mean and standard deviation of the hit indices.
     */
    @Override
    public void process() {

        loadNames();

        // getOptionValues returns null when the option was not supplied; guard it so
        // the "none" fallback below (and the help it triggers) is actually reachable
        String[] serviceNames = getCommandLine().getOptionValues("s");
        if (serviceNames != null && serviceNames.length > 1) {
            testMultiple();
            return;
        }

        final NameService<Identifier> service = (NameService<Identifier>) getNameService(get("service", "none"));
        if (!service.startup()) {
            LOGGER.error("Unable to start service");
        }
        service.setMaxResults(100);

        // the flag does not change while searching, read it once
        final boolean approximate = has("a");

        Multimap<String, Identifier> results = ArrayListMultimap.create();

        // doing the search
        long searchStart = System.currentTimeMillis();
        for (String name : names) {
            Collection<? extends Identifier> matches = service.searchName(name, approximate);
            results.putAll(name, matches);
            if (!matches.isEmpty()) {
                found++;
            }
        }
        long searchEnd = System.currentTimeMillis();

        long searchTime = searchEnd - searchStart;

        Multimap<String, Set<String>> nameResults = ArrayListMultimap.create();

        // resolving

        System.out.println("Transforming for performance test");

        long resolveStart = System.currentTimeMillis();
        for (Map.Entry<String, Identifier> e : results.entries()) {
            Identifier id = e.getValue();
            nameResults.put(e.getKey(), new HashSet<String>(service.getNames(id)));
        }
        long resolveEnd = System.currentTimeMillis();

        long resolveTime = resolveEnd - resolveStart;

        int trueFound = getRealScore(nameResults, new ChemicalFingerprintEncoder(), null);

        SummaryStatistics statistics = getHitIndices(nameResults, new ChemicalFingerprintEncoder());

        String[] row = new String[] { get("service", "unknown"), Long.toString(searchTime),
                Long.toString(resolveTime), Integer.toString(found), Integer.toString(trueFound),
                Double.toString(statistics.getMax()), Double.toString(statistics.getMean()),
                Double.toString(statistics.getStandardDeviation()) };

        System.out.println(Joiner.on("\t").join(row));

    }

    /**
     * Creates the name service for the given command line name.
     * <p/>
     * For an unrecognised name a message listing the valid names is printed,
     * the help is shown and the JVM is exited with status 0.
     *
     * @param name one of kegg, chebi, metacyc, lipidmaps or hmdb
     * @return a newly constructed service for the name
     */
    public NameService<? extends Identifier> getNameService(String name) {

        if (name.equals("kegg"))
            return new KEGGCompoundNameService();

        if (name.equals("chebi"))
            return new ChEBINameService();

        if (name.equals("metacyc"))
            return new MetaCycNameService();

        if (name.equals("lipidmaps"))
            return new LipidMapsNameService();

        if (name.equals("hmdb"))
            return new HMDBNameService();

        // nothing matched: tell the user what is accepted and terminate
        System.out.println("Invalid service name: valid=[kegg, chebi, metacyc, lipidmaps or hmdb]");
        super.showHelp();
        System.exit(0);

        // only present to satisfy the compiler, System.exit is invoked above
        return null;

    }

    /**
     * Collects statistics on the position at which the correct hit was found.
     * <p/>
     * For every query name the hits are scanned in their stored order. The
     * 1-based index of the first hit that has a name whose encoded form equals
     * the encoded query is added to the statistics. Queries without such a hit
     * contribute no value.
     *
     * @param results query name to the name sets of its hits (one set per hit)
     * @param encoder encoder used to normalise the query and the hit names
     * @return summary statistics over the 1-based indices of the matching hits
     */
    private SummaryStatistics getHitIndices(Multimap<String, Set<String>> results, FingerprintEncoder encoder) {

        SummaryStatistics summaryStatistics = new SummaryStatistics();

        QUERY: for (String name : results.keySet()) {

            String normName = encoder.encode(name);

            // 1-based position of the hit currently inspected
            int rank = 0;

            for (Set<String> hitNames : results.get(name)) {

                rank++;

                for (String hit : hitNames) {

                    if (normName.equals(encoder.encode(hit))) {
                        summaryStatistics.addValue(rank);
                        // only the first matching hit of a query is recorded
                        continue QUERY;
                    }

                }

            }

        }

        return summaryStatistics;

    }

    /**
     * Counts the queries for which at least one hit has a name whose encoded
     * form equals the encoded query.
     * <p/>
     * When a file is given (and can be opened) every query without such a hit is
     * written to it: the query name on one line followed by one tab separated
     * line per hit name holding the name, its encoded form and the Levenshtein
     * distance between the encoded query and the encoded hit name.
     *
     * @param results query name to the name sets of its hits (one set per hit)
     * @param encoder encoder used to normalise the query and the hit names
     * @param file    destination for the unmatched queries, may be {@code null}
     *                in which case nothing is written
     * @return the number of queries with a matching hit
     */
    private int getRealScore(Multimap<String, Set<String>> results, FingerprintEncoder encoder, File file) {

        FileWriter writer = null;

        if (file != null) {
            try {
                writer = new FileWriter(file);
            } catch (IOException e) {
                // the score is still computed, only the report is skipped
                System.err.println(e.getMessage());
            }
        }

        int hitCount = 0;

        try {

            QUERY: for (String name : results.keySet()) {

                String normName = encoder.encode(name);

                // the report (and the distance computation it needs) is only
                // required when there is somewhere to write it to
                StringBuilder report = (writer != null) ? new StringBuilder() : null;

                for (Set<String> hitNames : results.get(name)) {

                    for (String hit : hitNames) {

                        String normHit = encoder.encode(hit);

                        if (normName.equals(normHit)) {
                            hitCount++;
                            // matched queries are never reported
                            continue QUERY;
                        }

                        if (report != null) {
                            report.append("\t").append(hit);
                            report.append("\t").append(normHit);
                            report.append("\t").append(StringUtils.getLevenshteinDistance(normName, normHit));
                            report.append("\n");
                        }

                    }

                }

                if (writer != null) {
                    try {
                        writer.write(name + "\n");
                        writer.write(report.toString() + "\n");
                    } catch (IOException e) {
                        System.err.println("Unable to write unmatched result for '" + name + "': " + e.getMessage());
                    }
                }

            }

        } finally {
            // release the file even if encoding a name fails unexpectedly
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    System.err.println("Unable to close " + file + ": " + e.getMessage());
                }
            }
        }

        // only claim that results were written when a file was actually opened
        if (writer != null) {
            System.out.println("Written unmatched results to :" + file);
        }

        return hitCount;

    }

    /**
     * Reads the query names via {@link #getNames()}, stores them in the
     * {@code names} field and logs how many were loaded.
     */
    private void loadNames() {
        List<String> loaded = getNames();
        this.names = loaded;
        LOGGER.info("Loaded " + loaded.size() + " metabolite names");
    }

    /**
     * Builds a task that searches the given names in the given service and
     * reports how many of them produced at least one hit.
     * <p/>
     * The service is started immediately, when this method is called; the
     * searching itself only happens when the returned task is invoked. The task
     * prints its elapsed time to standard out on completion.
     *
     * @param nameSublist the names the task will search
     * @param service     the service to search in
     * @param approximate whether approximate matching is used
     * @return a task yielding the number of names with a non-empty result
     */
    private Callable<Integer> create(final List<String> nameSublist, final NameService service,
            final Boolean approximate) {

        System.out.println("Creating query: " + nameSublist.size());

        service.startup();

        Callable<Integer> task = new Callable<Integer>() {
            @Override
            public Integer call() throws Exception {
                final long begin = System.currentTimeMillis();
                int matched = 0;
                for (Iterator<String> it = nameSublist.iterator(); it.hasNext();) {
                    String query = it.next();
                    if (service.searchName(query, approximate).isEmpty()) {
                        continue;
                    }
                    matched++;
                }
                final long finish = System.currentTimeMillis();
                System.out.println("Completed " + (finish - begin) + " ms");
                return matched;
            }
        };

        return task;
    }

    /**
     * Loads the query names given on the command line.
     * <p/>
     * Names are read line by line from the classpath resource named by option
     * "i" and/or the file named by option "f". Both sources are merged into a
     * hash set, so duplicate lines are collapsed and the order of the returned
     * list is not the order of the input. Problems reading either source are
     * reported on standard error and do not abort loading.
     *
     * @return the distinct lines read from the configured sources
     */
    private List<String> getNames() {

        Set<String> names = new HashSet<String>();

        // load resource
        if (has("i")) {

            String resource = get("i", "");
            InputStream in = getClass().getResourceAsStream(resource);

            if (in == null) {
                // getResourceAsStream yields null for an unknown resource; report it
                // instead of failing with a NullPointerException when wrapping it
                System.err.println("Unable to find resource: " + resource);
            } else {
                try {
                    collectLines(new BufferedReader(new InputStreamReader(in)), names);
                } catch (IOException e) {
                    System.err.println(e.getMessage());
                }
            }

        }

        // load file
        if (has("f")) {

            try {
                collectLines(new BufferedReader(new FileReader(getFile("f"))), names);
            } catch (IOException e) {
                System.err.println(e);
            }

        }

        System.out.println("Loaded " + names.size());

        return new ArrayList<String>(names);

    }

    /**
     * Adds every line of the reader to the sink and closes the reader, whether
     * or not reading succeeded.
     *
     * @param reader source of the lines, closed on return
     * @param sink   collection receiving the lines
     * @throws IOException if a line could not be read
     */
    private void collectLines(BufferedReader reader, Set<String> sink) throws IOException {
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                sink.add(line);
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                // a failed close must not hide a read failure, only report it
                System.err.println(e);
            }
        }
    }

}