Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package chibi.gemmaanalysis.cli.deprecated; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Collection; import java.util.HashSet; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.lang3.StringUtils; import ubic.gemma.apps.ArrayDesignSequenceManipulatingCli; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.expression.designElement.CompositeSequenceService; import ubic.gemma.model.genome.biosequence.BioSequence; import ubic.gemma.model.genome.biosequence.BioSequenceService; import ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation; import ubic.gemma.model.genome.sequenceAnalysis.BlatAssociationService; import ubic.gemma.model.genome.sequenceAnalysis.BlatResult; import ubic.gemma.model.genome.sequenceAnalysis.BlatResultService; /** * Goes through the biosequences for array designs in the database and removes duplicates. * * @author pavlidis * @version $Id$ * @deprecated because we shouldn't really need this any more. */ @Deprecated public class BioSequenceCleanupCli extends ArrayDesignSequenceManipulatingCli { public static void main(String[] args) { BioSequenceCleanupCli p = new BioSequenceCleanupCli(); try { Exception ex = p.doWork(args); if (ex != null) { ex.printStackTrace(); } } catch (Exception e) { throw new RuntimeException(e); } } private BlatAssociationService blatAssociationService; private BlatResultService blatResultService; private BioSequenceService bss; private CompositeSequenceService css; private String file = null; private boolean justTesting = false; @SuppressWarnings("static-access") @Override protected void buildOptions() { super.buildOptions(); Option justTestingOption = OptionBuilder.withDescription("Set to run without any database modifications") .create("dryrun"); addOption(justTestingOption); Option sequenceNameList = OptionBuilder.hasArg().withArgName("file") .withDescription("File with list of biosequence ids to check.").create('b'); addOption(sequenceNameList); } @Override protected Exception doWork(String[] args) { Exception err = processCommandLine(args); if (err != null) return err; Collection<ArrayDesign> ads = new HashSet<ArrayDesign>(); if (!this.arrayDesignsToProcess.isEmpty()) { ads.addAll(this.arrayDesignsToProcess); } else if (file != null) { try (InputStream is = new FileInputStream(file); BufferedReader br = new BufferedReader(new InputStreamReader(is));) { String id = null; Collection<Long> ids = new HashSet<Long>(); while ((id = br.readLine()) != null) { if (StringUtils.isBlank(id)) { continue; } ids.add(Long.parseLong(id)); } Collection<BioSequence> bioSequences = bss.loadMultiple(ids); bioSequences = bss.thaw(bioSequences); processSequences(bioSequences); return null; } catch (Exception e) { return e; } } else { ads = this.arrayDesignService.loadAll(); } for (ArrayDesign design : ads) { log.info(design); design = unlazifyArrayDesign(design); Collection<BioSequence> bioSequences = new HashSet<BioSequence>(); for (CompositeSequence cs : design.getCompositeSequences()) { if (cs == null) continue; if (cs.getBiologicalCharacteristic() == null) continue; bioSequences.add(cs.getBiologicalCharacteristic()); } processSequences(bioSequences); // fast. log.info("Phase II starting"); // /////////////////////////////// // Second phase: make sure composite sequences don't refer to sequences that have duplicates based on name, // using stricter equality criteria. int i = 0; for (CompositeSequence cs : design.getCompositeSequences()) { if (++i % 500 == 0) { log.info("Processing: " + i + "/" + bioSequences.size() + " sequences"); } BioSequence anchorSeq = cs.getBiologicalCharacteristic(); if (anchorSeq == null) { continue; } Collection<BioSequence> seqs = bss.findByName(anchorSeq.getName()); // no evidence of duplicates? if (seqs.size() == 1) { continue; } seqs.remove(anchorSeq); seqs = this.bss.thaw(seqs); // ensure this group really does contain all duplicates. if (log.isDebugEnabled()) log.debug("Examining set of " + seqs.size() + " possible duplicates of " + anchorSeq); Collection<BioSequence> notDuplicate = new HashSet<BioSequence>(); for (BioSequence candidateForRemoval : seqs) { if (log.isDebugEnabled()) log.debug(" Examining: " + candidateForRemoval); assert !candidateForRemoval.equals(anchorSeq) : candidateForRemoval + " equals " + anchorSeq; if (!this.equals(anchorSeq, candidateForRemoval)) { notDuplicate.add(candidateForRemoval); } else { if (log.isDebugEnabled()) log.debug(" Duplicate: " + anchorSeq + " " + candidateForRemoval); } } seqs.removeAll(notDuplicate); for (BioSequence toChange : seqs) { if (log.isDebugEnabled()) log.debug("Processing " + toChange); if (!this.equals(anchorSeq, toChange)) { throw new IllegalStateException( "Sequences weren't equal " + anchorSeq + " and " + toChange); } switchAndDeleteExtra(anchorSeq, toChange); } } } return null; } @Override protected void processOptions() { super.processOptions(); if (this.hasOption("dryrun")) { this.justTesting = true; log.info("TEST MODE: NO DATABASE UPDATES WILL BE PERFORMED"); } if (this.hasOption('b')) { this.file = getOptionValue('b'); } bss = this.getBean(BioSequenceService.class); css = this.getBean(CompositeSequenceService.class); blatResultService = this.getBean(BlatResultService.class); blatAssociationService = this.getBean(BlatAssociationService.class); } /** * Test whether two sequences are effectively equal (ignore the ID) * * @param one * @param that * @return */ private boolean equals(BioSequence one, BioSequence that) { one = bss.thaw(one); that = bss.thaw(that); if (one.getSequenceDatabaseEntry() != null && that.getSequenceDatabaseEntry() != null && !one .getSequenceDatabaseEntry().getAccession().equals(that.getSequenceDatabaseEntry().getAccession())) return false; if (one.getTaxon() != null && that.getTaxon() != null && !one.getTaxon().equals(that.getTaxon())) return false; if (one.getName() != null && that.getName() != null && !one.getName().equals(that.getName())) return false; if (one.getSequence() != null && that.getSequence() != null && !one.getSequence().equals(that.getSequence())) return false; return true; } /** * @param bioSequences */ private void processSequences(Collection<BioSequence> bioSequences) { // /////////////////////////////// // First stage: fix biosequences that lack database entries, when there is one for another essentially // identical sequence (and the name is the same as the accession) for (BioSequence sequence : bioSequences) { Collection<BioSequence> reps = bss.findByName(sequence.getName()); if (reps.size() == 1) continue; reps = bss.thaw(reps); // pass 1: find an anchor. BioSequence anchor = null; for (BioSequence possibleAnchor : reps) { if (possibleAnchor.getSequenceDatabaseEntry() != null && possibleAnchor.getSequenceDatabaseEntry() .getAccession().equals(possibleAnchor.getName())) { anchor = possibleAnchor; } } if (anchor == null) continue; reps.remove(anchor); log.info("Examining duplicates of " + anchor); for (BioSequence rep : reps) { if (rep.getSequenceDatabaseEntry() != null) { if (rep.getSequenceDatabaseEntry().getAccession() .equals(anchor.getSequenceDatabaseEntry().getAccession())) { log.warn(anchor + " and " + rep + " have equivalent database entries for accession"); // they might have different names, but we don't care. One of them has to go. } else { log.warn( anchor + " and " + rep + " have distinct database entries for accession: skipping"); continue; } } log.info(sequence + " has a potential replica: " + rep); switchAndDeleteExtra(anchor, rep); } } } /** * @param keeper * @param toRemove */ private void switchAndDeleteExtra(BioSequence keeper, BioSequence toRemove) { // all composite sequences for bs2 will be switched to bs1. Collection<CompositeSequence> usingDuplicatedSequence = css.findByBioSequence(toRemove); css.thaw(usingDuplicatedSequence); for (CompositeSequence sequence : usingDuplicatedSequence) { log.info("Switching bioseq for " + sequence + " on " + sequence.getArrayDesign() + " from " + toRemove + " to " + keeper); if (!justTesting) sequence.setBiologicalCharacteristic(keeper); if (!justTesting) css.update(sequence); } Collection<BlatResult> blatResults = blatResultService.findByBioSequence(toRemove); for (BlatResult br : blatResults) { if (!justTesting) br.setQuerySequence(keeper); if (!justTesting) blatResultService.update(br); } Collection<BlatAssociation> bs2gps = blatAssociationService.find(toRemove); for (BlatAssociation bs2gp : bs2gps) { if (!justTesting) blatAssociationService.remove(bs2gp); } /* * This will fail with platforms that use AnnotationAssocations. That's easily fixed, but I prefer it this way * because those aren't real sequences. */ /* * Remove the other sequence. */ log.info("Deleting unused duplicate sequence " + toRemove); if (!justTesting) { bss.remove(toRemove); } } /* (non-Javadoc) * @see ubic.gemma.util.AbstractCLI#getCommandName() */ @Override public String getCommandName() { // TODO Auto-generated method stub return null; } }