uk.ac.ebi.mdk.service.loader.name.ChEBINameLoader.java Source code

Introduction

Here is the source code for uk.ac.ebi.mdk.service.loader.name.ChEBINameLoader.java
Source

/*
 * Copyright (c) 2013. EMBL, European Bioinformatics Institute
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package uk.ac.ebi.mdk.service.loader.name;

import au.com.bytecode.opencsv.CSVReader;
import com.google.common.base.Joiner;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import org.apache.log4j.Logger;
import uk.ac.ebi.caf.utility.preference.type.BooleanPreference;
import uk.ac.ebi.mdk.service.ProgressListener;
import uk.ac.ebi.mdk.service.ServicePreferences;
import uk.ac.ebi.mdk.service.exception.MissingLocationException;
import uk.ac.ebi.mdk.service.index.name.ChEBINameIndex;
import uk.ac.ebi.mdk.service.loader.AbstractChEBILoader;
import uk.ac.ebi.mdk.service.loader.location.RemoteLocation;
import uk.ac.ebi.mdk.service.loader.writer.DefaultNameIndexWriter;
import uk.ac.ebi.mdk.service.location.ResourceFileLocation;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * ChEBINameLoader - 28.02.2012 <br/> <p/> Loads ChEBI names into a searchable
 * lucene index. Some generalisations are made to reduce the complexity of the
 * provided data:
 * <ul>
 * <li>only names whose language is {@code en} or {@code la} are kept;</li>
 * <li>systematic names are treated as synonyms;</li>
 * <li>when an entry has multiple IUPAC names, brand names or INNs, the first
 * one encountered is indexed as such and any others are added as
 * synonyms.</li>
 * </ul>
 * <p/> The preferred name for each entry is taken from the ChEBI Compounds
 * resource first; if no preferred name exists there, the name from the ChEBI
 * Names resource is used. <p/> This loader writes into the index supplied to
 * the super class, a {@link ChEBINameIndex}.
 *
 * @author johnmay
 * @author $Author$ (this version)
 * @version $Rev$
 */
public class ChEBINameLoader extends AbstractChEBILoader {

    private static final Logger LOGGER = Logger.getLogger(ChEBINameLoader.class);

    /** Key of the required resource holding the compounds flat file. */
    private static final String COMPOUNDS_RESOURCE = "ChEBI Compounds";

    /** Key of the required resource holding the names flat file. */
    private static final String NAMES_RESOURCE = "ChEBI Names";

    /** Name values matching this pattern in the compounds file are skipped. */
    private static final Pattern NULL_PATTERN = Pattern.compile("null");

    /** Progress is reported once every this many processed entries. */
    private static final int PROGRESS_INTERVAL = 150;

    /**
     * Creates a loader backed by a new {@link ChEBINameIndex} and registers
     * the two remote flat files it reads from.
     *
     * @throws IOException propagated from the super class constructor or the
     *                     index creation
     */
    public ChEBINameLoader() throws IOException {
        super(new ChEBINameIndex());

        // both resources point at the tab delimited flat files on the EBI FTP
        // note: ChEBI Compounds is already required by the super class, but as they have the same
        //       name the resource is collapsed down to a single entry
        addRequiredResource(COMPOUNDS_RESOURCE,
                "compound.tsv flatfile from ChEBI, specifying preferred names, secondary and primary accessions",
                ResourceFileLocation.class, new RemoteLocation(
                        "ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/compounds.tsv"));
        addRequiredResource(NAMES_RESOURCE,
                "name.tsv flatfile from ChEBI, specifying names and their tpyes (e.g. IUPAC/Synonym)",
                ResourceFileLocation.class,
                new RemoteLocation("ftp://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/names.tsv"));

    }

    /**
     * Reads the names resource, groups every name by its (primary) accession
     * and type, and - unless the loader was cancelled - writes one index
     * entry per active accession.
     * <p/>
     * The names resource is closed when this method exits, whether normally
     * or through an exception.
     *
     * {@inheritDoc}
     *
     * @throws IOException if a resource cannot be read, has no header row, is
     *                     missing an expected column, or the index cannot be
     *                     written
     */
    @Override
    public void update() throws IOException {

        // get the preferred names first
        Map<String, String> preferredNameMap = getPreferredNameMap();

        // what we treat as synonyms
        Set<String> synonymType = new HashSet<String>(Arrays.asList("SYSTEMATIC NAME", "SYNONYM"));

        Multimap<String, String> synonyms = HashMultimap.create();
        Multimap<String, String> iupac = HashMultimap.create();
        Multimap<String, String> inn = HashMultimap.create();
        Multimap<String, String> brand = HashMultimap.create();

        ResourceFileLocation location = getLocation(NAMES_RESOURCE);
        // NOTE(review): the reader uses the platform default charset - confirm the encoding of the file
        InputStreamReader reader = new InputStreamReader(location.open());

        try {

            // '\0' is passed as the quote character; presumably this is to stop quotes
            // inside names being interpreted by the parser - TODO confirm
            CSVReader csv = new CSVReader(reader, '\t', '\0');

            List<String> header = readHeader(csv, NAMES_RESOURCE);
            int nameIndex = requireColumn(header, "NAME", NAMES_RESOURCE);
            int typeIndex = requireColumn(header, "TYPE", NAMES_RESOURCE);
            int accessionIndex = requireColumn(header, "COMPOUND_ID", NAMES_RESOURCE);
            int langIndex = requireColumn(header, "LANGUAGE", NAMES_RESOURCE);

            fireProgressUpdate("preloading primary identifiers...");
            createMap();
            fireProgressUpdate("done");

            int count = 0;
            String[] row = null;
            while (!isCancelled() && (row = csv.readNext()) != null) {

                if (row.length < header.size()) {
                    LOGGER.error("malformed entry: " + Joiner.on(", ").join(row));
                    continue;
                }

                String accession = getPrimaryIdentifier(row[accessionIndex]);
                String name = row[nameIndex];
                String type = row[typeIndex];
                String lang = row[langIndex];

                // only keep latin and english compound names
                if (!lang.equals("en") && !lang.equals("la")) {
                    continue;
                }

                // the type values tested below are distinct, so at most one branch applies
                if (synonymType.contains(type)) {
                    /* SYNONYM
                     * Type matches anything we are considering a synonym
                     */
                    synonyms.put(accession, name);
                } else if (type.equals("NAME")) {
                    /* PREFERRED NAME
                     * If there is already a preferred name in the map, check
                     * whether the preferred name matches the 'NAME'. If it
                     * doesn't, add the 'NAME' to the synonym map
                     */
                    if (preferredNameMap.containsKey(accession)) {
                        String preferredName = preferredNameMap.get(accession);
                        if (!preferredName.equals(name)) {
                            synonyms.put(accession, name);
                        }
                    } else {
                        preferredNameMap.put(accession, name);
                    }
                } else if (type.equals("IUPAC NAME")) {
                    putFirstOrSynonym(iupac, synonyms, accession, name);
                } else if (type.equals("BRAND NAME")) {
                    putFirstOrSynonym(brand, synonyms, accession, name);
                } else if (type.equals("INN")) {
                    putFirstOrSynonym(inn, synonyms, accession, name);
                }

                // update progress (note: reading the file is only the first half of the work,
                // hence the progress on the file is halved)
                if (++count % PROGRESS_INTERVAL == 0) {
                    fireProgressUpdate(location.progress() * 0.5);
                }

            }

            // if not cancelled write the index
            if (!isCancelled()) {
                writeIndex(preferredNameMap, synonyms, iupac, brand, inn);
            }

        } finally {
            // release the resource even when parsing or writing failed
            location.close();
        }

    }

    /**
     * Stores {@code name} as the single primary name of its kind for the
     * accession, or - when the accession already has a different primary name
     * of that kind - as a synonym.
     *
     * @param primary   names of one kind (e.g. IUPAC) keyed by accession
     * @param synonyms  synonyms keyed by accession
     * @param accession accession the name belongs to
     * @param name      the name to store
     */
    private static void putFirstOrSynonym(Multimap<String, String> primary,
                                          Multimap<String, String> synonyms,
                                          String accession,
                                          String name) {
        if (!primary.containsKey(accession)) {
            primary.put(accession, name);
        } else if (!primary.get(accession).contains(name)) {
            synonyms.put(accession, name);
        }
    }

    /**
     * Writes one entry per active accession to the index and reports the
     * second half (0.5 - 1.0) of the progress. The writer is closed even if
     * writing an entry fails.
     *
     * @param preferredNameMap preferred name for each accession
     * @param synonyms         synonyms keyed by accession
     * @param iupac            IUPAC names keyed by accession
     * @param brand            brand names keyed by accession
     * @param inn              INNs keyed by accession
     * @throws IOException if the index writer fails
     */
    private void writeIndex(Map<String, String> preferredNameMap,
                            Multimap<String, String> synonyms,
                            Multimap<String, String> iupac,
                            Multimap<String, String> brand,
                            Multimap<String, String> inn) throws IOException {

        // get all the accessions to iterate through
        Set<String> accessions = new HashSet<String>(preferredNameMap.size() * 2);
        accessions.addAll(synonyms.keySet());
        accessions.addAll(iupac.keySet());
        accessions.addAll(preferredNameMap.keySet());
        accessions.addAll(brand.keySet());
        accessions.addAll(inn.keySet());

        DefaultNameIndexWriter writer = new DefaultNameIndexWriter(getIndex());

        try {

            // kept as a double so the progress division below is not an integer division
            double size = accessions.size();
            int count = 0;

            // for each accession write the names to the index
            for (String accession : accessions) {

                if (isActive(accession)) {

                    String preferredName = preferredNameMap.containsKey(accession)
                            ? preferredNameMap.get(accession)
                            : "";

                    writer.write(accession,
                                 preferredName,
                                 firstOrEmpty(iupac, accession),
                                 firstOrEmpty(brand, accession),
                                 firstOrEmpty(inn, accession),
                                 synonyms.get(accession));

                }

                if (++count % PROGRESS_INTERVAL == 0) {
                    fireProgressUpdate(0.5 + ((count / size) * 0.5));
                }

            }

        } finally {
            writer.close();
        }
    }

    /**
     * Returns the first name stored for the accession, or an empty string
     * when the accession has none.
     *
     * @param names     names keyed by accession
     * @param accession accession to look up
     * @return first stored name or {@code ""}
     */
    private static String firstOrEmpty(Multimap<String, String> names, String accession) {
        return names.containsKey(accession) ? names.get(accession).iterator().next() : "";
    }

    /**
     * Reads the first row of the file as the header.
     *
     * @param csv      reader positioned at the start of the file
     * @param resource resource name, used in the error message only
     * @return the header columns, in file order
     * @throws IOException if the file has no rows at all or cannot be read
     */
    private static List<String> readHeader(CSVReader csv, String resource) throws IOException {
        String[] header = csv.readNext();
        if (header == null) {
            throw new IOException("no header row found in the '" + resource + "' resource");
        }
        return Arrays.asList(header);
    }

    /**
     * Finds the position of a column which must be present in the header.
     *
     * @param header   the header columns
     * @param column   the column name to find
     * @param resource resource name, used in the error message only
     * @return zero-based index of the column
     * @throws IOException if the header does not contain the column
     */
    private static int requireColumn(List<String> header, String column, String resource) throws IOException {
        int index = header.indexOf(column);
        if (index < 0) {
            throw new IOException("column '" + column + "' not found in the header of the '"
                                          + resource + "' resource: " + header);
        }
        return index;
    }

    /**
     * Builds a map of preferred names, keyed by primary identifier, from the
     * compounds file. Entries whose name is {@code null} (as text) are
     * skipped, as are rows too short to hold both the name and accession
     * columns. The compounds resource is closed when this method exits,
     * whether normally or through an exception.
     *
     * @return mutable map of primary identifier to preferred name
     * @throws IOException              if the file cannot be read, has no
     *                                  header row or is missing an expected
     *                                  column
     * @throws MissingLocationException declared as in the original method;
     *                                  presumably raised by the location
     *                                  lookup - TODO confirm
     */
    private Map<String, String> getPreferredNameMap() throws IOException, MissingLocationException {

        Map<String, String> preferredNames = new HashMap<String, String>();

        // need to build the map first as it uses the same resource
        createMap();

        ResourceFileLocation location = getLocation(COMPOUNDS_RESOURCE);
        // NOTE(review): the reader uses the platform default charset - confirm the encoding of the file
        InputStreamReader reader = new InputStreamReader(location.open());

        try {

            // NOTE(review): unlike update(), no quote character is given here so the
            // parser's default applies - confirm this difference is intended
            CSVReader csv = new CSVReader(reader, '\t');

            List<String> header = readHeader(csv, COMPOUNDS_RESOURCE);
            int nameIndex = requireColumn(header, "NAME", COMPOUNDS_RESOURCE);
            int accessionIndex = requireColumn(header, "CHEBI_ACCESSION", COMPOUNDS_RESOURCE);

            // smallest row length which still contains both columns we read
            int requiredLength = Math.max(nameIndex, accessionIndex) + 1;

            String[] row = null;
            while ((row = csv.readNext()) != null) {

                if (row.length < requiredLength) {
                    LOGGER.error("malformed entry: " + Joiner.on(", ").join(row));
                    continue;
                }

                String name = row[nameIndex];
                String accession = row[accessionIndex];

                if (NULL_PATTERN.matcher(name).matches()) {
                    continue;
                }

                preferredNames.put(getPrimaryIdentifier(accession), name);

            }

        } finally {
            // release the resource even when parsing failed
            location.close();
        }

        return preferredNames;
    }

    /**
     * Manual entry point: backs up the existing index when possible, prints
     * how long the {@code canUpdate()} check took (in milliseconds) and its
     * result, and runs the update if it is possible. Progress messages are
     * printed on a single, overwritten console line.
     *
     * @param args ignored
     * @throws Exception anything raised while backing up or updating
     */
    public static void main(String[] args) throws Exception {
        ChEBINameLoader loader = new ChEBINameLoader();
        if (loader.canBackup()) {
            loader.backup();
        }

        long t0 = System.nanoTime();
        boolean update = loader.canUpdate();
        long t1 = System.nanoTime();

        BooleanPreference p = ServicePreferences.getInstance().getPreference("PROXY_SET");
        p.put(true);

        // nanoseconds -> milliseconds
        System.out.println((t1 - t0) / 1e6);
        loader.addProgressListener(new ProgressListener() {
            @Override
            public void progressed(String message) {
                // carriage return so each message overwrites the previous one
                System.out.print("\r" + message);
            }
        });

        System.out.println(update);

        if (update) {
            loader.update();
        }
    }

}