Example usage for java.lang String contains

List of usage examples for java.lang String contains

Introduction

On this page you can find example usage for java.lang String contains.

Prototype

public boolean contains(CharSequence s) 

Document

Returns true if and only if this string contains the specified sequence of char values.
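
Example

A minimal, self-contained demonstration of the method (the class name and string values here are illustrative, not taken from the usage examples below):

public class ContainsDemo {
    public static void main(String[] args) {
        String s = "java.lang.String";

        // contains performs a case-sensitive substring test
        System.out.println(s.contains("lang")); // true
        System.out.println(s.contains("LANG")); // false

        // the parameter is a CharSequence, so a StringBuilder works too
        System.out.println(s.contains(new StringBuilder("String"))); // true
    }
}

Note that the empty string is contained in every string, so s.contains("") always returns true.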

Usage

From source file:imp.lstm.main.Driver.java

public static void main(String[] args)
        throws FileNotFoundException, IOException, ConfigurationException, InvalidParametersException {
    FileBasedConfigurationBuilder<PropertiesConfiguration> builder = new FileBasedConfigurationBuilder<>(
            PropertiesConfiguration.class).configure(
                    new Parameters().properties().setFileName(args[0]).setThrowExceptionOnMissing(true)
                            .setListDelimiterHandler(new DefaultListDelimiterHandler(';'))
                            .setIncludesAllowed(false));
    Configuration config = builder.getConfiguration();

    String inputSongPath = config.getString("input_song");
    String outputFolderPath = config.getString("output_folder");
    String autoEncoderParamsPath = config.getString("auto_encoder_params");
    String nameGeneratorParamsPath = config.getString("name_generator_params");
    String queueFolderPath = config.getString("queue_folder");
    String referenceQueuePath = config.getString("reference_queue", "nil");
    String inputCorpusFolder = config.getString("input_corpus_folder");
    boolean shouldWriteQueue = config.getBoolean("should_write_generated_queue");
    boolean frankensteinTest = config.getBoolean("queue_tests_frankenstein");
    boolean interpolateTest = config.getBoolean("queue_tests_interpolation");
    boolean iterateOverCorpus = config.getBoolean("iterate_over_corpus", false);
    boolean shouldGenerateSongTitle = config.getBoolean("generate_song_title");
    boolean shouldGenerateSong = config.getBoolean("generate_leadsheet");
    boolean advanceDecoding = config.getBoolean("advance_decoding", false); //assumed config key: the original snippet read advanceDecoding below without declaring it

    LogTimer.initStartTime(); //start our logging timer to keep track of our execution time
    LogTimer.log("Creating name generator...");

    //build a small LSTM-plus-dense network that generates a song title character by character
    LSTM lstm = new LSTM();
    FullyConnectedLayer fullLayer = new FullyConnectedLayer(Operations.None);
    Loadable titleNetLoader = new Loadable() {
        @Override
        public boolean load(INDArray array, String path) {
            String car = pathCar(path);
            String cdr = pathCdr(path);
            switch (car) {
            case "full":
                return fullLayer.load(array, cdr);
            case "lstm":
                return lstm.load(array, cdr);
            default:
                return false;
            }
        }
    };

    LogTimer.log("Packing name generator from files...");
    (new NetworkConnectomeLoader()).load(nameGeneratorParamsPath, titleNetLoader);

    String characterString = " !\"'[],-.01245679:?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz";

    //Initialization
    LogTimer.log("Creating autoencoder...");
    int inputSize = 34;
    int outputSize = EncodingParameters.noteEncoder.getNoteLength();
    int featureVectorSize = 100;
    ProductCompressingAutoencoder autoencoder = new ProductCompressingAutoencoder(24, 48, 84 + 1, false); //create our network

    int numInterpolationDivisions = 5;

    //"pack" the network from weights and biases file directory
    LogTimer.log("Packing autoencoder from files");
    (new NetworkConnectomeLoader()).load(autoEncoderParamsPath, autoencoder);

    File[] songFiles;
    if (iterateOverCorpus) {
        songFiles = new File(inputCorpusFolder).listFiles();
    } else {
        songFiles = new File[] { new File(inputSongPath) };
    }
    for (File inputFile : songFiles) {
        (new NetworkConnectomeLoader()).refresh(autoEncoderParamsPath, autoencoder, "initialstate");
        String songTitle;
        if (shouldGenerateSong) {
            Random rand = new Random();
            AVector charOut = Vector.createLength(characterString.length());
            GroupedSoftMaxSampler sampler = new GroupedSoftMaxSampler(
                    new Group[] { new Group(0, characterString.length(), true) });
            songTitle = "";
            for (int i = 0; i < 50; i++) {
                charOut = fullLayer.forward(lstm.step(charOut));
                charOut = sampler.filter(charOut);
                int charIndex = 0;
                for (; charIndex < charOut.length(); charIndex++) {
                    if (charOut.get(charIndex) == 1.0) {
                        break;
                    }
                }
                songTitle += characterString.substring(charIndex, charIndex + 1);
            }
            songTitle = songTitle.trim();

            LogTimer.log("Generated song name: " + songTitle);
        } else {
            songTitle = "The Song We Never Name";
        }
        LogTimer.log("Reading file...");
        LeadSheetDataSequence inputSequence = LeadSheetIO.readLeadSheet(inputFile); //read our leadsheet to get a data vessel as retrieved in rbm-provisor
        LeadSheetDataSequence outputSequence = inputSequence.copy();

        outputSequence.clearMelody();
        if (interpolateTest) {
            LeadSheetDataSequence additionalOutput = outputSequence.copy();
            for (int i = 0; i < numInterpolationDivisions; i++) {
                outputSequence.concat(additionalOutput.copy());
            }
        }
        LeadSheetDataSequence decoderInputSequence = outputSequence.copy();

        LogTimer.startLog("Encoding data...");
        //TradingTimer.initStart(); //start our trading timer to keep track our our generation versus realtime play
        while (inputSequence.hasNext()) { //iterate through time steps in input data
            //TradingTimer.waitForNextTimedInput();
            autoencoder.encodeStep(inputSequence.retrieve()); //feed the resultant input vector into the network
            if (advanceDecoding) { //if we are using advance decoding (we start decoding as soon as we can)
                if (autoencoder.canDecode()) { //if queue has enough data to decode from
                    outputSequence.pushStep(null, null,
                            autoencoder.decodeStep(decoderInputSequence.retrieve())); //take sampled data for a timestep from autoencoder
                    //TradingTimer.logTimestep(); //log our time to TradingTimer so we can know how far ahead of realtime we are
                }
            }
        }
        LogTimer.endLog();

        if (shouldWriteQueue) {
            String queueFilePath = queueFolderPath + java.io.File.separator
                    + inputFile.getName().replace(".ls", ".q");
            FragmentedNeuralQueue currQueue = autoencoder.getQueue();
            currQueue.writeToFile(queueFilePath);
            LogTimer.log("Wrote queue " + inputFile.getName().replace(".ls", ".q") + " to file...");
        }
        if (shouldGenerateSong) {
            if (interpolateTest) {

                FragmentedNeuralQueue refQueue = new FragmentedNeuralQueue();
                refQueue.initFromFile(referenceQueuePath);

                FragmentedNeuralQueue currQueue = autoencoder.getQueue();
                //currQueue.writeToFile(queueFilePath);

                autoencoder.setQueue(currQueue.copy());
                while (autoencoder.hasDataStepsLeft()) { //we are done encoding all time steps, so just finish decoding
                    outputSequence.pushStep(null, null,
                            autoencoder.decodeStep(decoderInputSequence.retrieve())); //take sampled data for a timestep from autoencoder
                    //TradingTimer.logTimestep(); //log our time to TradingTimer so we can know how far ahead of realtime we are       
                }

                for (int i = 1; i <= numInterpolationDivisions; i++) {
                    System.out.println("Starting interpolation " + ((1.0 / numInterpolationDivisions) * (i)));
                    (new NetworkConnectomeLoader()).refresh(autoEncoderParamsPath, autoencoder, "initialstate");
                    FragmentedNeuralQueue currCopy = currQueue.copy();
                    currCopy.basicInterpolate(refQueue, (1.0 / numInterpolationDivisions) * (i));
                    autoencoder.setQueue(currCopy);
                    int timeStep = 0;
                    while (autoencoder.hasDataStepsLeft()) { //we are done encoding all time steps, so just finish decoding
                        System.out.println("interpolation " + i + " step " + ++timeStep);
                        outputSequence.pushStep(null, null,
                                autoencoder.decodeStep(decoderInputSequence.retrieve())); //take sampled data for a timestep from autoencoder
                        //TradingTimer.logTimestep(); //log our time to TradingTimer so we can know how far ahead of realtime we are       
                    }
                }

            }
            if (frankensteinTest) {
                LogTimer.startLog("Loading queues");
                File queueFolder = new File(queueFolderPath);
                int numComponents = config.getInt("frankenstein_num_components", 5);
                int numCombinations = config.getInt("frankenstein_num_combinations", 6);
                double interpolationMagnitude = config.getDouble("frankenstein_magnitude", 2.0);
                if (queueFolder.isDirectory()) {
                    File[] queueFiles = queueFolder.listFiles(new FilenameFilter() {
                        @Override
                        public boolean accept(File dir, String name) {
                            return name.contains(".q");
                        }
                    });

                    List<File> fileList = new ArrayList<>();
                    for (File file : queueFiles) {
                        fileList.add(file);
                    }
                    Collections.shuffle(fileList);
                    int numSelectedFiles = (numComponents > queueFiles.length) ? queueFiles.length
                            : numComponents;

                    for (int i = 0; i < queueFiles.length - numSelectedFiles; i++) {
                        fileList.remove(fileList.size() - 1);
                    }
                    List<FragmentedNeuralQueue> queuePopulation = new ArrayList<>(fileList.size());
                    songTitle += " - a mix of ";
                    for (File file : fileList) {
                        FragmentedNeuralQueue newQueue = new FragmentedNeuralQueue();
                        newQueue.initFromFile(file.getPath());
                        queuePopulation.add(newQueue);
                        songTitle += file.getName().replace(".q", "") + ", "; //strip the ".q" extension; the original replaceAll(".ls", "") was a no-op on these queue files
                    }
                    LogTimer.endLog();

                    LeadSheetDataSequence additionalOutput = outputSequence.copy();
                    for (int i = 1; i < numCombinations; i++) {
                        outputSequence.concat(additionalOutput.copy());
                    }
                    decoderInputSequence = outputSequence.copy();

                    FragmentedNeuralQueue origQueue = autoencoder.getQueue();

                    for (int i = 0; i < numCombinations; i++) {
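                        // draw random strengths, normalize them to sum to 1, and blend the loaded queues into a copy of the original queue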

                        LogTimer.startLog("Performing queue interpolation...");
                        AVector combinationStrengths = Vector.createLength(queuePopulation.size());
                        Random vectorRand = new Random(i);
                        for (int j = 0; j < combinationStrengths.length(); j++) {
                            combinationStrengths.set(j, vectorRand.nextDouble());
                        }
                        combinationStrengths.divide(combinationStrengths.elementSum());
                        FragmentedNeuralQueue currQueue = origQueue.copy();
                        for (int k = 0; k < combinationStrengths.length(); k++) {
                            currQueue.basicInterpolate(queuePopulation.get(k),
                                    combinationStrengths.get(k) * interpolationMagnitude);
                        }
                        LogTimer.endLog();
                        autoencoder.setQueue(currQueue);
                        LogTimer.startLog("Refreshing autoencoder state...");
                        (new NetworkConnectomeLoader()).refresh(autoEncoderParamsPath, autoencoder,
                                "initialstate");
                        LogTimer.endLog();
                        LogTimer.startLog("Decoding segment...");
                        while (autoencoder.hasDataStepsLeft()) { //we are done encoding all time steps, so just finish decoding
                            outputSequence.pushStep(null, null,
                                    autoencoder.decodeStep(decoderInputSequence.retrieve())); //take sampled data for a timestep from autoencoder
                            //TradingTimer.logTimestep(); //log our time to TradingTimer so we can know how far ahead of realtime we are       
                        }
                        LogTimer.endLog();
                    }

                }
            }

            while (autoencoder.hasDataStepsLeft()) { //we are done encoding all time steps, so just finish decoding
                outputSequence.pushStep(null, null, autoencoder.decodeStep(decoderInputSequence.retrieve())); //take sampled data for a timestep from autoencoder
                //TradingTimer.logTimestep(); //log our time to TradingTimer so we can know how far ahead of realtime we are       
            }
            LogTimer.log("Writing file...");

            String outputFilename = outputFolderPath + java.io.File.separator
                    + inputFile.getName().replace(".ls", "_Output"); //we'll write our generated file with the same name plus "_Output"
            LeadSheetIO.writeLeadSheet(outputSequence, outputFilename, songTitle);
            System.out.println(outputFilename);
        } else {
            autoencoder.setQueue(new FragmentedNeuralQueue());
        }
    }
    LogTimer.log("Process finished"); //Done!

}

From source file:org.eclipse.swt.snippets.SnippetLauncher.java

public static void main(String[] args) {
    File sourceDir = SnippetsConfig.SNIPPETS_SOURCE_DIR;
    boolean hasSource = sourceDir.exists();
    int count = 500;
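    // assume at most 500 snippets unless the source directory tells us otherwise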
    if (hasSource) {
        File[] files = sourceDir.listFiles();
        if (files.length > 0)
            count = files.length;
    }
    for (int i = 1; i < count; i++) {
        if (SnippetsConfig.isPrintingSnippet(i))
            continue; // avoid printing to printer
        String className = "Snippet" + i;
        Class<?> clazz = null;
        try {
            clazz = Class.forName(SnippetsConfig.SNIPPETS_PACKAGE + "." + className);
        } catch (ClassNotFoundException e) {
            // not every index corresponds to a snippet class; skip the missing ones
        }
        if (clazz != null) {
            System.out.println("\n" + clazz.getName());
            if (hasSource) {
                File sourceFile = new File(sourceDir, className + ".java");
                try (FileReader reader = new FileReader(sourceFile)) {
                    char[] buffer = new char[(int) sourceFile.length()];
                    reader.read(buffer);
                    String source = String.valueOf(buffer);
                    int start = source.indexOf("package");
                    start = source.indexOf("/*", start);
                    int end = source.indexOf("* For a list of all");
                    System.out.println(source.substring(start, end - 3));
                    boolean skip = false;
                    String platform = SWT.getPlatform();
                    if (source.contains("PocketPC")) {
                        platform = "PocketPC";
                        skip = true;
                    } else if (source.contains("OpenGL")) {
                        platform = "OpenGL";
                        skip = true;
                    } else if (source.contains("JavaXPCOM")) {
                        platform = "JavaXPCOM";
                        skip = true;
                    } else {
                        String[] platforms = { "win32", "gtk" };
                        for (int p = 0; p < platforms.length; p++) {
                            if (!platforms[p].equals(platform) && source.contains("." + platforms[p])) {
                                platform = platforms[p];
                                skip = true;
                                break;
                            }
                        }
                    }
                    if (skip) {
                        System.out.println("...skipping " + platform + " example...");
                        continue;
                    }
                } catch (Exception e) {
                    // ignore problems reading or parsing the snippet source
                }
            }
            Method method = null;
            String[] param = SnippetsConfig.getSnippetArguments(i);
            try {
                method = clazz.getMethod("main", param.getClass());
            } catch (NoSuchMethodException e) {
                System.out.println("   Did not find main(String [])");
            }
            if (method != null) {
                try {
                    method.invoke(clazz, new Object[] { param });
                } catch (IllegalAccessException e) {
                    System.out.println("   Failed to launch (illegal access)");
                } catch (IllegalArgumentException e) {
                    System.out.println("   Failed to launch (illegal argument to main)");
                } catch (InvocationTargetException e) {
                    System.out.println("   Exception in Snippet: " + e.getTargetException());
                }
            }
        }
    }
}

From source file:edu.msu.cme.rdp.kmer.cli.FastKmerFilter.java

public static void main(String[] args) throws Exception {
    final KmerSet<Set<RefKmer>> kmerSet;
    final SeqReader queryReader;
    final SequenceType querySeqType;
    final File queryFile;
    final KmerStartsWriter out;
    final boolean translQuery;
    final int wordSize;
    final int translTable;
    final boolean alignedSeqs;
    final List<String> refLabels = new ArrayList<>();
    final int maxThreads;
    final int trieWordSize;

    try {
        CommandLine cmdLine = new PosixParser().parse(options, args);
        args = cmdLine.getArgs();

        if (args.length < 3) {
            throw new Exception("Unexpected number of arguments");
        }

        if (cmdLine.hasOption("out")) {
            out = new KmerStartsWriter(cmdLine.getOptionValue("out"));
        } else {
            out = new KmerStartsWriter(System.out);
        }

        if (cmdLine.hasOption("aligned")) {
            alignedSeqs = true;
        } else {
            alignedSeqs = false;
        }

        if (cmdLine.hasOption("transl-table")) {
            translTable = Integer.valueOf(cmdLine.getOptionValue("transl-table"));
        } else {
            translTable = 11;
        }

        if (cmdLine.hasOption("threads")) {
            maxThreads = Integer.valueOf(cmdLine.getOptionValue("threads"));
        } else {
            maxThreads = Runtime.getRuntime().availableProcessors();
        }

        queryFile = new File(args[1]);
        wordSize = Integer.valueOf(args[0]);
        SequenceType refSeqType = null;

        querySeqType = SeqUtils.guessSequenceType(queryFile);
        queryReader = new SequenceReader(queryFile);

        if (querySeqType == SequenceType.Protein) {
            throw new Exception("Expected nucl query sequences");
        }

        refSeqType = SeqUtils
                .guessSequenceType(new File(args[2].contains("=") ? args[2].split("=")[1] : args[2]));

        translQuery = refSeqType == SequenceType.Protein;

        if (translQuery && wordSize % 3 != 0) {
            throw new Exception("Word size must be a multiple of 3 for nucl ref seqs");
        }

        if (translQuery) {
            trieWordSize = wordSize / 3;
        } else {
            trieWordSize = wordSize;
        }
        kmerSet = new KmerSet<Set<RefKmer>>();//new KmerTrie(trieWordSize, translQuery);

        for (int index = 2; index < args.length; index++) {
            String refName;
            String refFileName = args[index];
            if (refFileName.contains("=")) {
                String[] lexemes = refFileName.split("=");
                refName = lexemes[0];
                refFileName = lexemes[1];
            } else {
                String tmpName = new File(refFileName).getName();
                if (tmpName.contains(".")) {
                    refName = tmpName.substring(0, tmpName.lastIndexOf("."));
                } else {
                    refName = tmpName;
                }
            }

            File refFile = new File(refFileName);

            if (refSeqType != SeqUtils.guessSequenceType(refFile)) {
                throw new Exception(
                        "Reference file " + refFile + " contains " + SeqUtils.guessFileFormat(refFile)
                                + " sequences but expected " + refSeqType + " sequences");
            }

            SequenceReader seqReader = new SequenceReader(refFile);
            Sequence seq;

            while ((seq = seqReader.readNextSequence()) != null) {
                if (seq.getSeqName().startsWith("#")) {
                    continue;
                }

                KmerGenerator kmers;
                try {
                    if (translQuery) { //protein ref
                        kmers = new ProtKmerGenerator(seq.getSeqString(), trieWordSize, alignedSeqs);
                    } else {
                        kmers = new NuclKmerGenerator(seq.getSeqString(), trieWordSize, alignedSeqs);
                    }
                    while (kmers.hasNext()) {
                        Kmer temp = kmers.next();
                        long[] next = temp.getLongKmers();
                        Set<RefKmer> refKmers = kmerSet.get(next);
                        if (refKmers == null) {
                            refKmers = new HashSet<>();
                            kmerSet.add(next, refKmers);
                        }

                        RefKmer kmerRef = new RefKmer();
                        kmerRef.modelPos = kmers.getPosition();
                        kmerRef.refFileIndex = refLabels.size();
                        kmerRef.refSeqid = seq.getSeqName();
                        refKmers.add(kmerRef);
                    }
                } catch (IllegalArgumentException ex) {
                    //System.err.println(seq.getSeqName()+ " " + ex.getMessage());
                }
            }
            seqReader.close();

            refLabels.add(refName);
        }

    } catch (Exception e) {
        new HelpFormatter().printHelp(
                "KmerSearch <kmerSize> <query_file> [name=]<ref_file> ...\nkmerSize should be multiple of 3, (recommend 45, minimum 30, maximum 63) ",
                options);
        e.printStackTrace();
        System.exit(1);
        throw new RuntimeException("Stupid jvm"); //While this will never get thrown it is required to make sure javac doesn't get confused about uninitialized variables
    }

    long startTime = System.currentTimeMillis();
    long seqCount = 0;
    final int maxTasks = 25000;

    System.err.println("Starting kmer mapping at " + new Date());
    System.err.println("*  Number of threads:       " + maxThreads);
    System.err.println("*  References:              " + refLabels);
    System.err.println("*  Reads file:              " + queryFile);
    System.err.println("*  Kmer length:             " + trieWordSize);
    System.err.println("*  Kmer Refset Size:        " + kmerSet.size());

    final AtomicInteger processed = new AtomicInteger();
    final AtomicInteger outstandingTasks = new AtomicInteger();

    ExecutorService service = Executors.newFixedThreadPool(maxThreads);

    Sequence querySeq;

    while ((querySeq = queryReader.readNextSequence()) != null) {
        seqCount++;

        String seqString = querySeq.getSeqString();

        if ((!translQuery && seqString.length() < wordSize)
                || (translQuery && seqString.length() < wordSize + 2)) {
            //System.err.println(querySeq.getSeqName() + "\t" + seqString.length());
            continue;
        }

        final Sequence threadSeq = querySeq;

        Runnable r = new Runnable() {

            public void run() {
                try {
                    processSeq(threadSeq, refLabels, kmerSet, out, wordSize, translQuery, translTable, false);
                    processSeq(threadSeq, refLabels, kmerSet, out, wordSize, translQuery, translTable, true);

                    processed.incrementAndGet();
                    outstandingTasks.decrementAndGet();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };

        outstandingTasks.incrementAndGet();
        service.submit(r);

        while (outstandingTasks.get() >= maxTasks)
            ; //busy-wait throttle: block submission until the outstanding tasks drain below maxTasks

        if ((processed.get() + 1) % 1000000 == 0) {
            System.err.println("Processed " + processed + " sequences in "
                    + (System.currentTimeMillis() - startTime) + " ms");
        }
    }

    service.shutdown();
    service.awaitTermination(1, TimeUnit.DAYS);

    System.err.println("Finished Processed " + processed + " sequences in "
            + (System.currentTimeMillis() - startTime) + " ms");

    out.close();
}

From source file:gov.nih.nci.ncicb.tcga.dcc.dam.util.TempClinicalDataLoader.java

public static void main(String[] args) {
    // first get the db connection properties
    String url = urlSet.get(args[1]);
    String user = args[2];
    String word = args[3];

    // make sure we have the Oracle driver somewhere
    try {
        Class.forName("oracle.jdbc.OracleDriver");
        Class.forName("org.postgresql.Driver");
    } catch (Exception x) {
        System.out.println("Unable to load the driver class!");
        System.exit(0);
    }
    // connect to the database
    try {
        dbConnection = DriverManager.getConnection(url, user, word);
        ClinicalBean.setDBConnection(dbConnection);
    } catch (SQLException x) {
        x.printStackTrace();
        System.exit(1);
    }

    final String xmlList = args[0];
    BufferedReader br = null;
    try {
        final Map<String, String> clinicalFiles = new HashMap<String, String>();
        final Map<String, String> biospecimenFiles = new HashMap<String, String>();
        final Map<String, String> fullFiles = new HashMap<String, String>();

        //noinspection IOResourceOpenedButNotSafelyClosed
        br = new BufferedReader(new FileReader(xmlList));

        // read the file list to get all the files to load
        while (br.ready()) {
            final String[] in = br.readLine().split("\\t");
            String xmlfile = in[0];
            String archive = in[1];

            if (xmlfile.contains("_clinical")) {
                clinicalFiles.put(xmlfile, archive);
            } else if (xmlfile.contains("_biospecimen")) {
                biospecimenFiles.put(xmlfile, archive);
            } else {
                fullFiles.put(xmlfile, archive);
            }
        }

        Date dateAdded = Calendar.getInstance().getTime();

        // NOTE!!! This deletes all data before the load starts, assuming we are re-loading everything.
        // a better way would be to figure out what has changed and load that, or to be able to load multiple versions of the data in the schema
        emptyClinicalTables(user);

        // load any "full" files first -- in case some archives aren't split yet
        for (final String file : fullFiles.keySet()) {
            String archive = fullFiles.get(file);
            System.out.println("Full file " + file + " in " + archive);
            // need to re-instantiate the disease-specific beans for each file
            createDiseaseSpecificBeans(xmlList);
            String disease = getDiseaseName(archive);
            processFullXmlFile(file, archive, disease, dateAdded);

            // memory leak or something... have to commit and close all connections and re-get connection
            // after each file to keep from using too much heap space.  this troubles me, but I have never had
            // the time to figure out why it happens
            resetConnections(url, user, word);
        }

        // now process all clinical files, and insert patients and clinical data
        for (final String clinicalFile : clinicalFiles.keySet()) {
            createDiseaseSpecificBeans(xmlList);
            String archive = clinicalFiles.get(clinicalFile);
            System.out.println("Clinical file " + clinicalFile + " in " + archive);
            String disease = getDiseaseName(archive);
            processClinicalXmlFile(clinicalFile, archive, disease, dateAdded);
            resetConnections(url, user, word);
        }

        // now process biospecimen files
        for (final String biospecimenFile : biospecimenFiles.keySet()) {
            createDiseaseSpecificBeans(xmlList);
            String archive = biospecimenFiles.get(biospecimenFile);
            String disease = getDiseaseName(archive);
            System.out.println("Biospecimen file " + biospecimenFile);
            processBiospecimenXmlFile(biospecimenFile, archive, disease, dateAdded);
            resetConnections(url, user, word);
        }

        // this sets relationships between these clinical tables and data browser tables, since we delete
        // and reload every time
        setForeignKeys();
        dbConnection.commit();
        dbConnection.close();
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    } finally {
        IOUtils.closeQuietly(br);
    }
}

From source file:cc.wikitools.lucene.IndexWikipediaDump.java

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file")
            .create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("maximum number of documents to index").create(MAX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads")
            .create(THREADS_OPTION));

    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION))
            : Integer.MAX_VALUE;
    int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION))
            : DEFAULT_NUM_THREADS;

    long startTime = System.currentTimeMillis();

    String path = cmdline.getOptionValue(INPUT_OPTION);
    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build();

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    LOG.info("Creating index at " + indexPath);
    LOG.info("Indexing with " + threads + " threads");

    try {
        WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path);

        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String page;
        while ((page = stream.readNext()) != null) {
            String title = cleaner.getTitle(page);

            // These are heuristics specifically for filtering out non-articles in enwiki-20120104.
            if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
                continue;
            }

            if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) {
                continue;
            }

            Runnable worker = new AddDocumentRunnable(writer, cleaner, page);
            executor.execute(worker);

            cnt++;
            if (cnt % 10000 == 0) {
                LOG.info(cnt + " articles added");
            }
            if (cnt >= maxdocs) {
                break;
            }
        }

        executor.shutdown();
        // Busy-wait until all tasks have finished
        while (!executor.isTerminated()) {
        }

        LOG.info("Total of " + cnt + " articles indexed.");

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        out.close();
    }
}

From source file:edu.toronto.cs.xcurator.cli.CLIRunner.java

public static void main(String[] args) {
    Options options = setupOptions();
    CommandLineParser parser = new BasicParser();
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption('t')) {
            fileType = line.getOptionValue('t');
        } else {
            fileType = XML;
        }
        if (line.hasOption('o')) {
            tdbDirectory = line.getOptionValue('o');
            File d = new File(tdbDirectory);
            if (!d.exists() || !d.isDirectory()) {
                throw new Exception("TDB directory does not exist, please create.");
            }
        }
        if (line.hasOption('h')) {
            domain = line.getOptionValue('h');
            try {
                URL url = new URL(domain); //constructed only to validate the domain URL syntax
            } catch (MalformedURLException ex) {
                throw new Exception("The domain name is ill-formed");
            }
        } else {
            printHelpAndExit(options);
        }
        if (line.hasOption('m')) {
            serializeMapping = true;
            mappingFilename = line.getOptionValue('m');
        }
        if (line.hasOption('d')) {
            dirLocation = line.getOptionValue('d');
            inputStreams = new ArrayList<>();
            final List<String> files = Util.getFiles(dirLocation);
            for (String inputfile : files) {
                File f = new File(inputfile);
                if (f.isFile() && f.exists()) {
                    System.out.println("Adding document to mapping discoverer: " + inputfile);
                    inputStreams.add(new FileInputStream(f));
                } // If it is a URL download link for the document from SEC
                else if (inputfile.startsWith("http") && inputfile.contains("://")) {
                    // Download
                    System.out.println("Adding remote document to mapping discoverer: " + inputfile);
                    try {
                        URL url = new URL(inputfile);
                        InputStream remoteDocumentStream = url.openStream();
                        inputStreams.add(remoteDocumentStream);
                    } catch (MalformedURLException ex) {
                        throw new Exception("The document URL is ill-formed: " + inputfile);
                    } catch (IOException ex) {
                        throw new Exception("Error in downloading remote document: " + inputfile);
                    }
                } else {
                    throw new Exception("Cannot open XBRL document: " + f.getName());
                }
            }
        }

        if (line.hasOption('f')) {
            fileLocation = line.getOptionValue('f');
            inputStreams = new ArrayList<>();
            File f = new File(fileLocation);
            if (f.isFile() && f.exists()) {
                System.out.println("Adding document to mapping discoverer: " + fileLocation);
                inputStreams.add(new FileInputStream(f));
            } // If it is a URL download link for the document from SEC
            else if (fileLocation.startsWith("http") && fileLocation.contains("://")) {
                // Download
                System.out.println("Adding remote document to mapping discoverer: " + fileLocation);
                try {
                    URL url = new URL(fileLocation);
                    InputStream remoteDocumentStream = url.openStream();
                    inputStreams.add(remoteDocumentStream);
                } catch (MalformedURLException ex) {
                    throw new Exception("The document URL is ill-formed: " + fileLocation);
                } catch (IOException ex) {
                    throw new Exception("Error in downloading remote document: " + fileLocation);
                }
            } else {

                throw new Exception("Cannot open XBRL document: " + f.getName());
            }

        }

        setupDocumentBuilder();
        RdfFactory rdfFactory = new RdfFactory(new RunConfig(domain));
        List<Document> documents = new ArrayList<>();
        for (InputStream inputStream : inputStreams) {
            Document dataDocument = null;
            if (fileType.equals(JSON)) {
                String json = IOUtils.toString(inputStream);
                final String xml = Util.json2xml(json);
                final InputStream xmlInputStream = IOUtils.toInputStream(xml);
                dataDocument = createDocument(xmlInputStream);
            } else {
                dataDocument = createDocument(inputStream);
            }
            documents.add(dataDocument);
        }
        if (serializeMapping) {
            System.out.println("Mapping file will be saved to: " + new File(mappingFilename).getAbsolutePath());
            rdfFactory.createRdfs(documents, tdbDirectory, mappingFilename);
        } else {
            rdfFactory.createRdfs(documents, tdbDirectory);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        System.err.println("Unexpected exception: " + ex.getMessage());
        System.exit(1);
    }
}

From source file:main.DOORS_Service.java

/**
 * Login to the DWA server and perform some OSLC actions
 * @param args
 * @throws ParseException 
 */
public static void main(String[] args) throws ParseException {

    Options options = new Options();

    options.addOption("url", true, "url");
    options.addOption("user", true, "user ID");
    options.addOption("password", true, "password");
    options.addOption("project", true, "project area");

    CommandLineParser cliParser = new GnuParser();

    //Parse the command line
    CommandLine cmd = cliParser.parse(options, args);

    if (!validateOptions(cmd)) {
        logger.severe(
                "Syntax:  java <class_name> -url https://<server>:port/<context>/ -user <user> -password <password> -project \"<project_area>\"");
        logger.severe(
                "Example: java DoorsOauthSample -url https://exmple.com:9443/dwa -user ADMIN -password ADMIN -project \"JKE Banking (Requirements Management)\"");
        return;
    }

    String webContextUrl = cmd.getOptionValue("url");
    String user = cmd.getOptionValue("user");
    String passwd = cmd.getOptionValue("password");
    String projectArea = cmd.getOptionValue("project");

    try {
        //STEP 1: Initialize a Jazz rootservices helper and indicate we're looking for the RequirementManagement catalog
        // The root services for DOORs is found at /public level
        JazzRootServicesHelper helper = new JazzRootServicesHelper(webContextUrl + "/public",
                OSLCConstants.OSLC_RM);

        //STEP 2: Create a new OSLC OAuth capable client
        OslcOAuthClient client = helper.initOAuthClient("JIRA", "JIRA");

        if (client != null) {
            //STEP 3: Try to access the context URL to trigger the OAuth dance and login
            try {
                client.getResource(webContextUrl, OSLCConstants.CT_RDF);
            } catch (OAuthRedirectException oauthE) {
                validateTokens(client,
                        oauthE.getRedirectURL() + "?oauth_token=" + oauthE.getAccessor().requestToken, user,
                        passwd, webContextUrl + "/j_acegi_security_check");
                // Try to access again
                ClientResponse response = client.getResource(webContextUrl, OSLCConstants.CT_RDF);
                response.getEntity(InputStream.class).close();
            }

            //STEP 4: Get our requirements collection that we want
            //TODO: Replace with option from startup
            String serviceProviderUrl = "http://usnx47:8080/dwa/rm/urn:rational::1-4d2b67b464226e12-M-0000048a";
            ClientResponse response = client.getResource(serviceProviderUrl,
                    "application/x-oslc-rm-requirement-collection-1.0+xml");
            //build the rdf
            Model rdfModel = ModelFactory.createDefaultModel();
            rdfModel.read(response.getEntity(InputStream.class), serviceProviderUrl);
            response.consumeContent();

            //get the statements
            List<Statement> reqs = rdfModel.getResource(serviceProviderUrl).listProperties().toList();
            HashMap<String, String> requirements = new HashMap<String, String>();
            for (Statement s : reqs) {

                String reqURI = s.getObject().toString();
                if (reqURI.contains("http")) {
                    response = client.getResource(reqURI, "application/x-oslc-rm-requirement-1.0+xml");
                    if (response.getStatusCode() == 200) {
                        InputStream in = response.getEntity(InputStream.class);
                        Model model = ModelFactory.createDefaultModel();

                        try {
                            model.read(in, reqURI);
                        } catch (Exception sa) {
                            System.out.println(reqURI);
                        }

                        //Properties to traverse on
                        Property attrDef = model
                                .createProperty("http://jazz.net/doors/xmlns/prod/jazz/doors/1.0/attrDef");
                        Property name = model
                                .createProperty("http://jazz.net/doors/xmlns/prod/jazz/doors/1.0/name");

                        //Flags we use for parsing
                        int count = 0;
                        boolean isText = false;
                        boolean isID = false;
                        boolean done = false;
                        //Text of the DOORS Object and its ID are what we are going to extract
                        String text = "";
                        String id = "";

                        //Look through all of the possible fields
                        StmtIterator statementIter = model.listStatements();
                        while (statementIter.hasNext() && !done) {
                            Statement field = statementIter.next();
                            //Get the attrDef property to find out what kind of value we have
                            StmtIterator props = field.getSubject().listProperties(attrDef);
                            while (props.hasNext() && !done) {
                                Statement kind = props.next();
                                RDFNode propertyNode = kind.getObject();
                                StmtIterator propIt = propertyNode.asResource().listProperties(name);
                                //Check all of the properties for our desired fields
                                while (propIt.hasNext()) {
                                    Statement node = propIt.next();
                                    if (node.getObject().isLiteral()) {
                                        if (node.getObject().toString().contains("Object+Text")
                                                && field.getObject().isLiteral()) {
                                            text = field.getLiteral().toString();
                                            text = text.substring(0, text.indexOf("^"));
                                            count++;

                                        }
                                        if (node.getObject().toString().contains("Absolute+Number")
                                                && field.getObject().isLiteral()) {
                                            id = field.getLiteral().toString();
                                            id = id.substring(0, id.indexOf("^"));
                                            count++;
                                        }

                                    }
                                }
                                if (count == 2) {
                                    if (!text.isEmpty()) {
                                        //System.out.println( "Req: " + id );
                                        //System.out.println( text );
                                        requirements.put(id, text);
                                        count = 0;
                                        done = true;
                                        break;
                                    }

                                }
                            }

                        }

                    }

                }
                response.consumeContent();
            }
            //check if already in JIRA
            //post to jira
            for (Entry<String, String> e : requirements.entrySet()) {
                // left unimplemented in this example: create or update a JIRA issue per requirement
            }
        }

    } catch (Exception e) {
        logger.log(Level.SEVERE, e.getMessage(), e);
    }

}

From source file:edu.nyu.vida.data_polygamy.pre_processing.PreProcessing.java

/**
 * @param args
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option nameOption = new Option("dn", "name", true, "the name of the dataset");
    nameOption.setRequired(true);
    nameOption.setArgName("DATASET NAME");
    options.addOption(nameOption);

    Option headerOption = new Option("dh", "header", true, "the file that contains the header of the dataset");
    headerOption.setRequired(true);
    headerOption.setArgName("DATASET HEADER FILE");
    options.addOption(headerOption);

    Option defaultsOption = new Option("dd", "defaults", true,
            "the file that contains the default values of the dataset");
    defaultsOption.setRequired(true);
    defaultsOption.setArgName("DATASET DEFAULTS FILE");
    options.addOption(defaultsOption);

    Option tempResOption = new Option("t", "temporal", true,
            "desired temporal resolution (hour, day, week, or month)");
    tempResOption.setRequired(true);
    tempResOption.setArgName("TEMPORAL RESOLUTION");
    options.addOption(tempResOption);

    Option spatialResOption = new Option("s", "spatial", true,
            "desired spatial resolution (points, nbhd, zip, grid, or city)");
    spatialResOption.setRequired(true);
    spatialResOption.setArgName("SPATIAL RESOLUTION");
    options.addOption(spatialResOption);

    Option currentSpatialResOption = new Option("cs", "current-spatial", true,
            "current spatial resolution (points, nbhd, zip, grid, or city)");
    currentSpatialResOption.setRequired(true);
    currentSpatialResOption.setArgName("CURRENT SPATIAL RESOLUTION");
    options.addOption(currentSpatialResOption);

    Option indexResOption = new Option("i", "index", true, "indexes of the temporal and spatial attributes");
    indexResOption.setRequired(true);
    indexResOption.setArgName("INDEX OF SPATIO-TEMPORAL RESOLUTIONS");
    indexResOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(indexResOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);
    String dataset = cmd.getOptionValue("dn");
    String header = cmd.getOptionValue("dh");
    String defaults = cmd.getOptionValue("dd");
    String temporalResolution = cmd.getOptionValue("t");
    String spatialResolution = cmd.getOptionValue("s");
    String gridResolution = "";
    String currentSpatialResolution = cmd.getOptionValue("cs");

    if (spatialResolution.contains("grid")) {
        String[] res = spatialResolution.split("-");
        spatialResolution = res[0];
        gridResolution = res[1];
    }

    conf.set("header", s3bucket + FrameworkUtils.dataDir + "/" + header);
    conf.set("defaults", s3bucket + FrameworkUtils.dataDir + "/" + defaults);
    conf.set("temporal-resolution", temporalResolution);
    conf.set("spatial-resolution", spatialResolution);
    conf.set("grid-resolution", gridResolution);
    conf.set("current-spatial-resolution", currentSpatialResolution);

    String[] indexes = cmd.getOptionValues("i");
    String temporalPos = "";
    Integer sizeSpatioTemp = 0;
    if (!(currentSpatialResolution.equals("points"))) {
        String spatialPos = "";
        for (int i = 0; i < indexes.length; i++) {
            temporalPos += indexes[i] + ",";
            spatialPos += indexes[++i] + ",";
            sizeSpatioTemp++;
        }
        conf.set("spatial-pos", spatialPos);
    } else {
        String xPositions = "", yPositions = "";
        for (int i = 0; i < indexes.length; i++) {
            temporalPos += indexes[i] + ",";
            xPositions += indexes[++i] + ",";
            yPositions += indexes[++i] + ",";
            sizeSpatioTemp++;
        }
        conf.set("xPositions", xPositions);
        conf.set("yPositions", yPositions);
    }
    conf.set("temporal-pos", temporalPos);

    conf.set("size-spatio-temporal", sizeSpatioTemp.toString());

    // checking resolutions

    if (utils.spatialResolution(spatialResolution) < 0) {
        System.out.println("Invalid spatial resolution: " + spatialResolution);
        System.exit(-1);
    }

    if (utils.spatialResolution(spatialResolution) == FrameworkUtils.POINTS) {
        System.out.println("The data needs to be reduced at least to neighborhoods or grid.");
        System.exit(-1);
    }

    if (utils.spatialResolution(currentSpatialResolution) < 0) {
        System.out.println("Invalid spatial resolution: " + currentSpatialResolution);
        System.exit(-1);
    }

    if (utils.spatialResolution(currentSpatialResolution) > utils.spatialResolution(spatialResolution)) {
        System.out.println("The current spatial resolution is coarser than "
                + "the desired one. You can only navigate from a fine resolution" + " to a coarser one.");
        System.exit(-1);
    }

    if (utils.temporalResolution(temporalResolution) < 0) {
        System.out.println("Invalid temporal resolution: " + temporalResolution);
        System.exit(-1);
    }

    String fileName = s3bucket + FrameworkUtils.preProcessingDir + "/" + dataset + "-" + temporalResolution
            + "-" + spatialResolution + gridResolution;
    conf.set("aggregates", fileName + ".aggregates");

    // making sure both files are removed, if they exist
    FrameworkUtils.removeFile(fileName, s3conf, s3);
    FrameworkUtils.removeFile(fileName + ".aggregates", s3conf, s3);

    /**
     * Hadoop Parameters
     * sources: http://www.slideshare.net/ImpetusInfo/ppt-on-advanced-hadoop-tuning-n-optimisation
     *          https://cloudcelebrity.wordpress.com/2013/08/14/12-key-steps-to-keep-your-hadoop-cluster-running-strong-and-performing-optimum/
     */

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");

    // using SnappyCodec for intermediate and output data?
    // TODO: for now, using SnappyCodec -- what about LZO + Protocol Buffer serialization?
    //   LZO - http://www.oberhumer.com/opensource/lzo/#download
    //   Hadoop-LZO - https://github.com/twitter/hadoop-lzo
    //   Protocol Buffer - https://github.com/twitter/elephant-bird
    //   General Info - http://www.devx.com/Java/Article/47913
    //   Compression - http://comphadoop.weebly.com/index.html
    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        conf.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        conf.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.BZip2Codec");
    }
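    // note: both branches above compress intermediate (map output) data immediately; the
    // final output codec set here only takes effect once setCompressOutput(job, true) is
    // called on the SequenceFileOutputFormat below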

    // TODO: this is dangerous!
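    // plaintext keys in the job configuration are readable by anyone who can see the job
    // conf; an IAM role or a Hadoop credential provider would avoid shipping them at all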
    if (s3) {
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    Job job = new Job(conf);
    job.setJobName(dataset + "-" + temporalResolution + "-" + spatialResolution);

    job.setMapOutputKeyClass(MultipleSpatioTemporalWritable.class);
    job.setMapOutputValueClass(AggregationArrayWritable.class);

    job.setOutputKeyClass(MultipleSpatioTemporalWritable.class);
    job.setOutputValueClass(AggregationArrayWritable.class);

    job.setMapperClass(PreProcessingMapper.class);
    job.setCombinerClass(PreProcessingCombiner.class);
    job.setReducerClass(PreProcessingReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());
    //job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(job, new Path(s3bucket + FrameworkUtils.dataDir + "/" + dataset));
    FileOutputFormat.setOutputPath(job, new Path(fileName));

    job.setJarByClass(PreProcessing.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(fileName + "\t" + (System.currentTimeMillis() - start));

}
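
The option parsing above is easy to misread: the loop variable is advanced inside the body, so each iteration consumes a (temporal, spatial) pair -- or a (temporal, x, y) triple for point data -- from the flat "i" option array. A minimal, self-contained sketch of the pair case (class name and sample data are illustrative, not from the original project):

public class InterleavedIndexes {
    public static void main(String[] args) {
        String[] indexes = { "0", "3", "1", "4" }; // flat (temporal, spatial) pairs; length must be even
        StringBuilder temporalPos = new StringBuilder();
        StringBuilder spatialPos = new StringBuilder();
        for (int i = 0; i < indexes.length; i++) {
            temporalPos.append(indexes[i]).append(',');  // temporal index
            spatialPos.append(indexes[++i]).append(','); // ++i consumes the paired spatial index
        }
        System.out.println("temporal-pos=" + temporalPos + " spatial-pos=" + spatialPos);
    }
}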

From source file:com.thesmartweb.vivliocrawlermaven.VivlioCrawlerMavenMain.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) {
    try {

        OaiPmhServer server = new OaiPmhServer("http://vivliothmmy.ee.auth.gr/cgi/oai2");
        RecordsList listRecords = server.listRecords("oai_dc");//we fetch all records in OAI DC (Dublin Core) format
        List<VivlioCrawlerMavenMain> listtotal = new ArrayList<VivlioCrawlerMavenMain>();
        //we load the names of the current and former professors of ECE, AUTH from a txt file
        //change the path below to match your setup
        List<String> profs = Files.readAllLines(Paths.get(
                "/home/themis/NetBeansProjects/VivlioCrawlerMaven/src/main/java/com/thesmartweb/vivliocrawlermaven/profs.txt"));

        boolean more = true;//flag indicating that more records remain beyond the current batch
        JSONArray array = new JSONArray();//our final, combined json array
        JSONObject jsonObject = new JSONObject();//our final, combined json object
        while (more) {
            for (Record rec : listRecords.asList()) {
                VivlioCrawlerMavenMain vc = new VivlioCrawlerMavenMain();
                Element metadata = rec.getMetadata();
                if (metadata != null) {
                    //System.out.println(rec.getMetadataAsString());
                    List<Element> elements = metadata.elements();
                    //System.out.println(metadata.getStringValue());
                    for (Element element : elements) {
                        String name = element.getName();
                        //we get the title, remove \r and \n, and trim leading and trailing whitespace
                        if (name.equalsIgnoreCase("title")) {
                            vc.title = element.getStringValue();
                            vc.title = vc.title.trim();
                            vc.title = vc.title.replaceAll("(\\r|\\n)", "");
                            if (!(vc.title.endsWith("."))) {
                                vc.title = vc.title + ".";//we also add a trailing dot so all titles are uniform
                            }
                        }
                        if (name.equalsIgnoreCase("creator")) {
                            vc.creators.add(element.getStringValue());//we capture the students' names
                        }
                        if (name.equalsIgnoreCase("subject")) {
                            vc.subjects.add(element.getStringValue());//we capture the subjects
                        }
                        if (name.equalsIgnoreCase("description")) {
                            vc.description = element.getStringValue();//we capture the abstract
                        }
                        if (name.equalsIgnoreCase("date")) {
                            vc.datestring = element.getStringValue();
                        }
                        if (name.equalsIgnoreCase("identifier")) {
                            if (element.getStringValue().contains("http://")) {
                                vc.thesisFiles.add(element.getStringValue());//we capture the URL of the full thesis file
                                if (vc.thesisURL == null) {
                                    vc.thesisURL = element.getStringValue().substring(0, 32);
                                }
                            }
                            //if the identifier contains the title then it must be the citation,
                            //out of which we need to extract the supervisor's name
                            if (element.getStringValue().contains(vc.title.substring(0, 10))) {
                                vc.citation = element.getStringValue();
                                vc.supervisor = element.getStringValue();
                                Iterator<String> profsIterator = profs.iterator();
                                vc.supervisor = vc.supervisor.replace(vc.title, "");//we remove the title out of the citation
                                //if we have two students we remove the first occurrence of "και" ("and" in Greek)
                                if (vc.creators.size() == 2) {
                                    vc.supervisor = vc.supervisor.replaceFirst("και", "");
                                }
                                //we remove the students' names
                                Iterator<String> creatorsIterator = vc.creators.iterator();
                                while (creatorsIterator.hasNext()) {
                                    vc.supervisor = vc.supervisor.replace(creatorsIterator.next(), "");
                                }
                                boolean profFlag = false;//flag indicating that we found the supervising professor
                                while (profsIterator.hasNext() && !profFlag) {
                                    String prof = profsIterator.next();
                                    //we split the professor's name to surname and name
                                    //because some entries have first the surname and others first the name
                                    String[] profSplitted = prof.split("\\s+");
                                    String supervisorCleared = vc.supervisor;
                                    supervisorCleared = supervisorCleared.replaceAll("\\s+", "");//we clear the white space
                                    supervisorCleared = supervisorCleared.replaceAll("(\\r|\\n)", "");//we remove the \r\n
                                    //now we check if the citation includes any of the professors' names from the txt file
                                    if (supervisorCleared.contains(profSplitted[0])
                                            && supervisorCleared.contains(profSplitted[1])) {
                                        vc.supervisor = prof;
                                        profFlag = true;
                                    }
                                }
                                //if we don't find the name of the supervisor, we have to perform string manipulation to extract it
                                if (!profFlag) {
                                    vc.supervisor = vc.supervisor.trim();
                                    //we remove the words "Θεσσαλονίκη" ("Thessaloniki") and "Ελλάδα" ("Greece"),
                                    //accented and unaccented (the Greek literals here are best-guess
                                    //reconstructions; the originals were garbled)
                                    if (vc.supervisor.contains("Θεσσαλονίκη")) {
                                        vc.supervisor = vc.supervisor.replaceFirst("Θεσσαλονίκη", "");
                                    }
                                    if (vc.supervisor.contains("Θεσσαλονικη")) {
                                        vc.supervisor = vc.supervisor.replaceFirst("Θεσσαλονικη", "");
                                    }
                                    if (vc.supervisor.contains("Ελλάδα")) {
                                        vc.supervisor = vc.supervisor.replaceFirst("Ελλάδα", "");
                                    }
                                    if (vc.supervisor.contains("Ελλαδα")) {
                                        vc.supervisor = vc.supervisor.replaceFirst("Ελλαδα", "");
                                    }
                                    //we remove the year and then we should be left only with the supervisor's name
                                    vc.supervisor = vc.supervisor.replace("(", "");
                                    vc.supervisor = vc.supervisor.trim();
                                    vc.supervisor = vc.supervisor.replace(")", "");
                                    vc.supervisor = vc.supervisor.trim();
                                    vc.supervisor = vc.supervisor.replace(",", "");
                                    vc.supervisor = vc.supervisor.trim();
                                    vc.supervisor = vc.supervisor.replace(".", "");
                                    vc.supervisor = vc.supervisor.trim();
                                    vc.supervisor = vc.supervisor.replace(vc.datestring.substring(0, 4), "");
                                    vc.supervisor = vc.supervisor.trim();
                                }
                                //we put everything in a json object
                                JSONObject obj = new JSONObject();
                                obj.put("title", vc.title);
                                obj.put("description", vc.description);
                                JSONArray creatorsArray = new JSONArray();
                                creatorsArray.add(vc.creators);//note: adds the whole list as one element; addAll would flatten it
                                obj.put("creators", creatorsArray);
                                JSONArray subjectsArray = new JSONArray();
                                List<String> subjectsList = new ArrayList<String>(vc.subjects);
                                subjectsArray.add(subjectsList);
                                obj.put("subjects", subjectsArray);
                                obj.put("datestring", vc.datestring);
                                JSONArray thesisFilesArray = new JSONArray();
                                thesisFilesArray.add(vc.thesisFiles);
                                obj.put("thesisFiles", thesisFilesArray);
                                obj.put("thesisURL", vc.thesisURL);
                                obj.put("supervisor", vc.supervisor);
                                obj.put("citation", vc.citation);
                                //if you are using JSON.simple do this
                                array.add(obj);
                            }
                        }
                    }
                    listtotal.add(vc);//a list of all parsed records; unused for now,
                    //but kept for potential extensions of this work
                }
            }
            //check the resumption token to see whether more records are available
            if (listRecords.getResumptionToken() != null) {
                listRecords = server.listRecords(listRecords.getResumptionToken());
            } else {
                more = false;
            }
        }
        //we print the records for which no supervisor could be determined
        for (VivlioCrawlerMavenMain vctest : listtotal) {

            if (vctest.supervisor == null) {
                System.out.println(vctest.title);
                System.out.println(vctest.citation);
            }
        }
        //we pretty-print the json with Gson and write it to a file
        jsonObject.put("VivliothmmyOldArray", array);
        JsonParser parser = new JsonParser();
        JsonObject json = parser.parse(jsonObject.toJSONString()).getAsJsonObject();
        Gson gson = new GsonBuilder().setPrettyPrinting().create();
        String prettyJson = gson.toJson(json);
        try {

            FileWriter file = new FileWriter(
                    "/home/themis/NetBeansProjects/VivlioCrawlerMaven/src/main/java/com/thesmartweb/vivliocrawlermaven/VivliothmmyOldRecords.json");
            file.write(prettyJson);
            file.flush();
            file.close();

        } catch (IOException e) {
            System.out.println("Exception: " + e);
        }

    } catch (OAIException | IOException e) {
        System.out.println("Exception: " + e);
    }
}
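
The crawler above leans on String.contains for classification: an identifier containing "http://" is treated as the thesis file URL, while one containing the first ten characters of the title is treated as the citation. A stripped-down sketch of that decision (sample data is made up):

public class IdentifierClassifier {
    public static void main(String[] args) {
        String title = "A sample thesis title that is long enough.";
        String[] identifiers = {
                "http://vivliothmmy.ee.auth.gr/123/1/thesis.pdf",
                "A sample thesis title that is long enough. Student Name (2014)." };
        for (String id : identifiers) {
            if (id.contains("http://")) {
                System.out.println("thesis file URL: " + id);
            } else if (id.contains(title.substring(0, 10))) {
                System.out.println("citation: " + id);
            }
        }
    }
}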

From source file:bear.core.BearMain.java

/**
 * -VbearMain.appConfigDir=src/main/groovy/examples -VbearMain.buildDir=.bear/classes -VbearMain.script=dumpSampleGrid -VbearMain.projectClass=SecureSocialDemoProject -VbearMain.propertiesFile=.bear/test.properties
 */
public static void main(String[] args) throws Exception {
    int i = ArrayUtils.indexOf(args, "--log-level");

    if (i != -1) {
        LoggingBooter.changeLogLevel(LogManager.ROOT_LOGGER_NAME, Level.toLevel(args[i + 1]));
    }

    i = ArrayUtils.indexOf(args, "-q");

    if (i != -1) {
        LoggingBooter.changeLogLevel(LogManager.ROOT_LOGGER_NAME, Level.WARN);
    }

    GlobalContext global = GlobalContext.getInstance();

    BearMain bearMain = null;

    try {
        bearMain = new BearMain(global, getCompilerManager(), args);
    } catch (Exception e) {
        if (e.getClass().getSimpleName().equals("MissingRequiredOptionException")) {
            System.out.println(e.getMessage());
        } else {
            Throwables.getRootCause(e).printStackTrace();
        }

        System.exit(-1);
    }

    if (bearMain.checkHelpAndVersion()) {
        return;
    }

    AppOptions2 options2 = bearMain.options;

    if (options2.has(AppOptions2.UNPACK_DEMOS)) {
        String filesAsText = ProjectGenerator.readResource("/demoFiles.txt");

        int count = 0;

        for (String resource : filesAsText.split("::")) {
            File dest = new File(BEAR_DIR + resource);
            System.out.printf("copying %s to %s...%n", resource, dest);

            writeStringToFile(dest, ProjectGenerator.readResource(resource));

            count++;
        }

        System.out.printf("extracted %d files%n", count);

        return;
    }

    if (options2.has(AppOptions2.CREATE_NEW)) {
        String dashedTitle = options2.get(AppOptions2.CREATE_NEW);

        String user = options2.get(AppOptions2.USER);
        String pass = options2.get(AppOptions2.PASSWORD);

        List<String> hosts = options2.getList(AppOptions2.HOSTS);

        List<String> template;

        if (options2.has(AppOptions2.TEMPLATE)) {
            template = options2.getList(AppOptions2.TEMPLATE);
        } else {
            template = emptyList();
        }

        ProjectGenerator g = new ProjectGenerator(dashedTitle, user, pass, hosts, template);

        if (options2.has(AppOptions2.ORACLE_USER)) {
            g.oracleUser = options2.get(AppOptions2.ORACLE_USER);
        }

        if (options2.has(AppOptions2.ORACLE_PASSWORD)) {
            g.oraclePassword = options2.get(AppOptions2.ORACLE_PASSWORD);
        }

        File projectFile = new File(BEAR_DIR, g.getProjectTitle() + ".groovy");
        File pomFile = new File(BEAR_DIR, "pom.xml");

        writeStringToFile(projectFile, g.processTemplate("TemplateProject.template"));

        writeStringToFile(new File(BEAR_DIR, dashedTitle + ".properties"),
                g.processTemplate("project-properties.template"));
        writeStringToFile(new File(BEAR_DIR, "demos.properties"),
                g.processTemplate("project-properties.template"));
        writeStringToFile(new File(BEAR_DIR, "bear-fx.properties"),
                g.processTemplate("bear-fx.properties.template"));

        writeStringToFile(pomFile, g.generatePom(dashedTitle));

        System.out.printf("Created project file: %s%n", projectFile.getPath());
        System.out.printf("Created maven pom: %s%n", pomFile.getPath());

        System.out.println("\nProject files have been created. You may now: " + "\n a) Run `bear "
                + g.getShortName() + ".ls` to quick-test your minimal setup"
                + "\n b) Import the project to IDE or run smoke tests, find more details at the project wiki: https://github.com/chaschev/bear/wiki/.");

        return;
    }

    Bear bear = global.bear;

    if (options2.has(AppOptions2.QUIET)) {
        global.put(bear.quiet, true);
        LoggingBooter.changeLogLevel(LogManager.ROOT_LOGGER_NAME, Level.WARN);
    }

    if (options2.has(AppOptions2.USE_UI)) {
        global.put(bear.useUI, true);
    }

    if (options2.has(AppOptions2.NO_UI)) {
        global.put(bear.useUI, false);
    }

    List<?> list = options2.getOptionSet().nonOptionArguments();

    if (list.size() > 1) {
        throw new IllegalArgumentException("too many arguments: " + list + ", "
                + "please specify an invoke line, project.method(arg1, arg2)");
    }

    if (list.isEmpty()) {
        throw new UnsupportedOperationException("todo implement running a single project");
    }

    String invokeLine = (String) list.get(0);

    String projectName;
    String method;

    if (invokeLine.contains(".")) {
        projectName = StringUtils.substringBefore(invokeLine, ".");
        method = StringUtils.substringAfter(invokeLine, ".");
    } else {
        projectName = invokeLine;
        method = null;
    }

    if (method == null || method.isEmpty())
        method = "deploy()";
    if (!method.contains("("))
        method += "()";

    Optional<CompiledEntry<? extends BearProject>> optional = bearMain.compileManager.findProject(projectName);

    if (!optional.isPresent()) {
        throw new IllegalArgumentException("project was not found: " + projectName + ", loaded classes: \n"
                + Joiner.on("\n").join(bearMain.compileManager.findProjects()) + ", searched in: "
                + bearMain.compileManager.getSourceDirs() + ", ");
    }

    BearProject project = OpenBean.newInstance(optional.get().aClass).injectMain(bearMain);

    GroovyShell shell = new GroovyShell();

    shell.setVariable("project", project);
    shell.evaluate("project." + method);
}
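
The invoke-line handling above reduces to two contains checks: a "." splits the project name from the method, and a missing "(" means the bare method name needs "()" appended, with "deploy()" as the default. The same normalization in plain JDK calls, without Commons Lang (sample invoke lines are illustrative):

public class InvokeLineDemo {
    public static void main(String[] args) {
        for (String invokeLine : new String[] { "demo", "demo.ls", "demo.stop()" }) {
            String projectName;
            String method;
            if (invokeLine.contains(".")) {
                int dot = invokeLine.indexOf('.');
                projectName = invokeLine.substring(0, dot); // StringUtils.substringBefore
                method = invokeLine.substring(dot + 1);     // StringUtils.substringAfter
            } else {
                projectName = invokeLine;
                method = null;
            }
            if (method == null || method.isEmpty())
                method = "deploy()"; // default task
            if (!method.contains("("))
                method += "()";      // bare name -> call syntax
            System.out.println(projectName + " -> " + method);
        }
    }
}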