Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package chibi.gemmaanalysis.cli.deprecated; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.Writer; import java.sql.SQLException; import java.util.Collection; import java.util.Map; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.lang3.reflect.FieldUtils; import ubic.gemma.analysis.sequence.BlatAssociationScorer; import ubic.gemma.analysis.sequence.ProbeMapper; import ubic.gemma.analysis.sequence.ProbeMapperConfig; import ubic.gemma.apps.ArrayDesignProbeMapperCli; import ubic.gemma.externalDb.GoldenPathSequenceAnalysis; import ubic.gemma.loader.genome.BlatResultParser; import ubic.gemma.loader.genome.FastaParser; import ubic.gemma.loader.util.parser.TabDelimParser; import ubic.gemma.model.genome.Gene; import ubic.gemma.model.genome.Taxon; import ubic.gemma.genome.taxon.service.TaxonService; import ubic.gemma.model.genome.biosequence.BioSequence; import ubic.gemma.model.genome.biosequence.BioSequenceService; import ubic.gemma.model.genome.gene.GeneProduct; import ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation; import ubic.gemma.model.genome.sequenceAnalysis.BlatResult; import ubic.gemma.model.genome.sequenceAnalysis.BlatResultService; import ubic.gemma.persistence.Persister; import ubic.gemma.persistence.PersisterHelper; import ubic.gemma.util.AbstractSpringAwareCLI; /** * Given a blat result set for an array design, annotate and find the 3' locations for all the really good hits. This * CLI was written to take a file as input, and write results to a file or standard out, but has been modified to * process sequence ids from a file and load the results into the db. * <p> * * @author pavlidis * @version $Id$ * @see ArrayDesignProbeMapperCli for the tool we use day-to-day * @deprecated beause ArrayDesignProbeMapperCli has most of the functionality. */ @Deprecated public class ProbeMapperCli extends AbstractSpringAwareCLI { @Override public String getShortDesc() { return "Not really used much any more; look at ArrayDesignProbeMapperCli"; } ProbeMapper probeMapper; private static final String DEFAULT_DATABASE = "hg18"; /** * @param bestOutputFileName * @return * @throws IOException */ private static Writer getWriterForBestResults(String bestOutputFileName) throws IOException { Writer w = null; File o = new File(bestOutputFileName); w = new BufferedWriter(new FileWriter(o)); try { w.write(""); } catch (IOException e) { throw new RuntimeException("Could not write to " + bestOutputFileName); } return w; } public static void main(String[] args) { ProbeMapperCli p = new ProbeMapperCli(); Exception e = p.doWork(args); if (e != null) { System.err.println(e.getLocalizedMessage()); log.error(e, e); } } private String databaseName = DEFAULT_DATABASE; private String outputFileName = null; private String blatFileName = null; private String ncbiIdentifierFileName = null; private String fastaFileName = null; private String sequenceIdentifierFileName = null; private TaxonService taxonService; private Taxon taxon; private ProbeMapperConfig config; public ProbeMapperCli() { super(); } @SuppressWarnings("static-access") @Override protected void buildOptions() { Option blatResultOption = OptionBuilder.hasArg().withArgName("PSL file") .withDescription("Blat result file in PSL format").withLongOpt("blatfile").create('b'); addOption(blatResultOption); Option databaseNameOption = OptionBuilder.hasArg().withArgName("database") .withDescription("GoldenPath database id (default=" + DEFAULT_DATABASE + ")") .withLongOpt("database").create('d'); addOption(OptionBuilder.hasArg().withArgName("value") .withDescription( "Sequence identity threshold, default = " + ProbeMapperConfig.DEFAULT_IDENTITY_THRESHOLD) .withLongOpt("identityThreshold").create('i')); addOption(OptionBuilder.hasArg().withArgName("value") .withDescription("Blat score threshold, default = " + ProbeMapperConfig.DEFAULT_SCORE_THRESHOLD) .withLongOpt("scoreThreshold").create('s')); addOption(OptionBuilder.hasArg().withArgName("file name") .withDescription("File containing Genbank identifiers").withLongOpt("gbfile").create('g')); addOption(OptionBuilder.hasArg().withArgName("file name") .withDescription("File containing sequences in FASTA format").withLongOpt("fastaFile").create('f')); addOption(OptionBuilder.hasArg().withArgName("file name") .withDescription("File containing BioSequence primary keys (results are saved to database)") .create("seqIds")); addOption(OptionBuilder.hasArg().withArgName("file name").withDescription("Output file basename") .withLongOpt("outputFile").create('o')); addOption(databaseNameOption); } @SuppressWarnings("resource") @Override protected Exception doWork(String[] args) { try { Exception err = processCommandLine(args); if (err != null) return err; Writer resultsOut = null; Writer bestResultsOut = null; if (outputFileName != null) { String bestOutputFileName = outputFileName + ".best"; log.info("Saving best to " + bestOutputFileName); resultsOut = getWriterForBestResults(outputFileName); bestResultsOut = getWriterForBestResults(bestOutputFileName); } log.info("DatabaseHost = " + this.host + " User=" + this.username); if (blatFileName != null) { File f = new File(blatFileName); if (!f.canRead()) throw new IOException("Can't read file " + blatFileName); Map<String, Collection<BlatAssociation>> results = runOnBlatResults(new FileInputStream(f), resultsOut); printBestResults(results, bestResultsOut); } else if (ncbiIdentifierFileName != null) { File f = new File(ncbiIdentifierFileName); if (!f.canRead()) throw new IOException("Can't read file " + ncbiIdentifierFileName); Map<String, Collection<BlatAssociation>> results = runOnGbIds(new FileInputStream(f), resultsOut); printBestResults(results, bestResultsOut); } else if (fastaFileName != null) { File f = new File(fastaFileName); if (!f.canRead()) throw new IOException("Can't read file " + fastaFileName); Map<String, Collection<BlatAssociation>> results = runOnSequences(new FileInputStream(f), resultsOut); printBestResults(results, bestResultsOut); } else if (this.sequenceIdentifierFileName != null) { runOnBioSequences(); } else { String[] moreArgs = getArgs(); if (moreArgs.length == 0) { System.out.println( "You must provide either a Blat result file, a FASTA file, a Genbank identifier file, or some Genbank identifiers"); printHelp(); return new Exception("Missing genbank identifiers"); } GoldenPathSequenceAnalysis goldenPathDb; goldenPathDb = new GoldenPathSequenceAnalysis(this.databaseName); for (int i = 0; i < moreArgs.length; i++) { String gbId = moreArgs[i]; log.debug("Got " + gbId); Map<String, Collection<BlatAssociation>> results = probeMapper.processGbId(goldenPathDb, gbId); printBlatAssociations(resultsOut, results); printBestResults(results, bestResultsOut); } } if (resultsOut != null) resultsOut.close(); if (bestResultsOut != null) bestResultsOut.close(); } catch (Exception e) { return new RuntimeException(e); } return null; } /** * @throws SQLException * @throws FileNotFoundException * @throws IOException */ private void runOnBioSequences() throws SQLException, FileNotFoundException, IOException { GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(this.databaseName); InputStream stream = new FileInputStream(new File(this.sequenceIdentifierFileName)); TabDelimParser parser = new TabDelimParser(); parser.parse(stream); Collection<String[]> seqIds = parser.getResults(); log.info(seqIds.size() + " ids found in file"); BioSequenceService bss = this.getBean(BioSequenceService.class); Persister persisterHelper = this.getBean(PersisterHelper.class); BlatResultService blatResultService = this.getBean(BlatResultService.class); int count = 0; int hits = 0; for (String[] strings : seqIds) { try { if (strings.length == 0) continue; String idString = strings[0]; Long id = Long.parseLong(idString); BioSequence bs = bss.load(id); bs = bss.thaw(bs); if (bs == null) continue; final Collection<BlatResult> blatResults = blatResultService.findByBioSequence(bs); if (blatResults == null || blatResults.isEmpty()) continue; Map<String, Collection<BlatAssociation>> results = probeMapper.processBlatResults(goldenPathDb, blatResults); for (Collection<BlatAssociation> col : results.values()) { for (BlatAssociation association : col) { if (log.isDebugEnabled()) log.debug(association); } persisterHelper.persist(col); ++hits; } if (++count % 100 == 0) { log.info("Processed " + count + " sequences" + " with blat results; " + hits + " mappings found."); } } catch (NumberFormatException e) { log.warn(strings[0] + " was not a number"); } } } /** * Trim the results down to a set of "best" results. The results are sent to a provided writer * * @throws IOException * @param results * @param writer */ public void printBestResults(Map<String, Collection<BlatAssociation>> results, Writer writer) throws IOException { log.info("Preparing 'best' matches"); writeHeader(writer); for (String probe : results.keySet()) { BlatAssociation best = BlatAssociationScorer.scoreResults(results.get(probe), config); if (best == null) { continue; } writeDesignElementBlatAssociation(writer, best); } } /** * @param stream * @param output * @return */ public Map<String, Collection<BlatAssociation>> runOnSequences(InputStream stream, Writer output) { try { GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(this.databaseName); FastaParser parser = new FastaParser(); parser.parse(stream); writeHeader(output); Collection<BioSequence> sequences = parser.getResults(); log.debug("Parsed " + sequences.size() + " sequences from the stream"); Map<String, Collection<BlatAssociation>> allRes = probeMapper.processSequences(goldenPathDb, sequences, this.config); printBlatAssociations(output, allRes); return allRes; } catch (SQLException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } } @Override protected void processOptions() { super.processOptions(); probeMapper = this.getBean(ProbeMapper.class); taxonService = this.getBean(TaxonService.class); this.config = new ProbeMapperConfig(); if (hasOption('s')) { config.setBlatScoreThreshold(getDoubleOptionValue('s')); } if (hasOption('i')) { config.setIdentityThreshold(getDoubleOptionValue('i')); } if (hasOption('d')) { this.databaseName = getOptionValue('d'); } else { this.databaseName = DEFAULT_DATABASE; } guessTaxon(); if (hasOption('f')) { this.fastaFileName = getOptionValue('f'); } if (hasOption('b')) { this.blatFileName = getFileNameOptionValue('b'); } if (hasOption('g')) { this.ncbiIdentifierFileName = getFileNameOptionValue('g'); } if (hasOption("seqIds")) { this.sequenceIdentifierFileName = getFileNameOptionValue("seqIds"); } this.outputFileName = getOptionValue('o'); } private void guessTaxon() { if (this.databaseName.startsWith("hg")) { this.taxon = taxonService.findByCommonName("human"); } else if (this.databaseName.startsWith("mm")) { this.taxon = taxonService.findByCommonName("mouse"); } else if (this.databaseName.startsWith("rn")) { this.taxon = taxonService.findByCommonName("rat"); } else { throw new IllegalStateException("Unknown taxon for database: " + this.databaseName); } } /** * @param output * @param probeName * @param arrayName * @param ld * @throws IOException */ public void writeDesignElementBlatAssociation(Writer output, BlatAssociation association) throws IOException { BlatResult blatRes = association.getBlatResult(); String[] sa = splitBlatQueryName(blatRes); String arrayName = ""; String probeName = null; if (sa.length == 2) { arrayName = sa[0]; probeName = sa[1]; } else if (sa.length == 1) { probeName = sa[0]; } else { throw new RuntimeException("Query name was not in understood format"); } GeneProduct product = association.getGeneProduct(); Gene g = product.getGene(); output.write(probeName + "\t" + arrayName + "\t" + blatRes.getMatches() + "\t" + blatRes.getQuerySequence().getLength() + "\t" + (blatRes.getTargetEnd() - blatRes.getTargetStart()) + "\t" + blatRes.score() + "\t" + g.getOfficialSymbol() + "\t" + product.getNcbiGi() + "\t" + association.getThreePrimeDistance() + "\t" + association.getOverlap() + "\t" + blatRes.getTargetChromosome().getName() + "\t" + blatRes.getTargetStart() + "\t" + blatRes.getTargetEnd() + "\n"); } /** * @param blatResultInputStream * @param output * @return * @throws IOException * @throws SQLException * @throws InstantiationException * @throws IllegalAccessException * @throws ClassNotFoundException */ public Map<String, Collection<BlatAssociation>> runOnBlatResults(InputStream blatResultInputStream, Writer output) throws IOException, SQLException { GoldenPathSequenceAnalysis goldenPathAnalysis = new GoldenPathSequenceAnalysis(this.databaseName); BlatResultParser brp = new BlatResultParser(); brp.parse(blatResultInputStream); writeHeader(output); Collection<BlatResult> blatResults = brp.getResults(); // Fill in the taxon. assert this.taxon != null; for (BlatResult blatResult : blatResults) { blatResult.getQuerySequence().setTaxon(taxon); try { FieldUtils.writeField(blatResult.getTargetChromosome(), "taxon", taxon, true); FieldUtils.writeField(blatResult.getTargetAlignedRegion().getChromosome(), "taxon", taxon, true); } catch (IllegalAccessException e) { e.printStackTrace(); } } Map<String, Collection<BlatAssociation>> allRes = probeMapper.processBlatResults(goldenPathAnalysis, blatResults, this.config); printBlatAssociations(output, allRes); blatResultInputStream.close(); output.close(); return allRes; } /** * @param output * @param allRes * @throws IOException */ private void printBlatAssociations(Writer output, Map<String, Collection<BlatAssociation>> allRes) throws IOException { if (output == null) return; for (Collection<BlatAssociation> associations : allRes.values()) { for (BlatAssociation blatAssociation : associations) { this.writeDesignElementBlatAssociation(output, blatAssociation); } } } /** * @param stream containing genbank accessions, one per line; configuration has no effect. * @param writer * @return */ public Map<String, Collection<BlatAssociation>> runOnGbIds(InputStream stream, Writer writer) throws IOException, SQLException { GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(this.databaseName); TabDelimParser parser = new TabDelimParser(); parser.parse(stream); writeHeader(writer); Collection<String[]> genbankIds = parser.getResults(); log.debug("Parsed " + genbankIds.size() + " lines from the stream"); Map<String, Collection<BlatAssociation>> allRes = probeMapper.processGbIds(goldenPathDb, genbankIds); printBlatAssociations(writer, allRes); stream.close(); writer.close(); return allRes; } /** * @param blatRes * @return */ private String[] splitBlatQueryName(BlatResult blatRes) { assert blatRes != null; assert blatRes.getQuerySequence() != null; String qName = blatRes.getQuerySequence().getName(); String[] sa = qName.split(":"); // if ( sa.length < 2 ) throw new IllegalArgumentException( "Expected query name in format 'xxx:xxx'" ); return sa; } /** * Generate a header for the output file. TODO: should be optional. */ private void writeHeader(Writer output) throws IOException { output.write("Probe" + "\t" + "Array" + "\t" + "Blat.matches" + "\t" + "Blat.queryLength" + "\t" + "Blat.targetAlignmentLength" + "\t" + "Blat.score" + "\t" + "Gene.symbol" + "\t" + "Gene.NCBIid" + "\t" + "threePrime.distance" + "\t" + "exonOverlap" + "\t" + "Blat.Chromosome" + "\t" + "Blat.targetStart" + "\t" + "Blat.targetEnd" + "\n"); } /* * (non-Javadoc) * * @see ubic.gemma.util.AbstractCLI#getCommandName() */ @Override public String getCommandName() { return null; } }