Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.cc;

import org.apache.commons.codec.binary.Base32;
import org.apache.commons.codec.digest.DigestUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Wrapper around wget that runs several wget processes in parallel and stores
 * each fetched file under {@code <outputdir>/<first two digest chars>/<digest>}.
 * <p>
 * One producer thread ({@link QueueFiller}) reads {@code digest\turl} lines from
 * the input file into a bounded queue; {@code numThreads} workers
 * ({@link WGetter}) drain the queue and invoke wget once per entry. When a line
 * carries no digest, the file is downloaded to a temp file and its digest is
 * the Base32 encoding of the SHA-1 of the downloaded bytes.
 */
public class WReGetter {

    /** Capacity of the bounded hand-off queue between the filler and the workers. */
    private static final int QUEUE_CAPACITY = 1000;

    private final Base32 base32 = new Base32();
    static AtomicInteger WGET_COUNTER = new AtomicInteger(0);
    private Path rootDir;

    public static void main(String[] args) throws Exception {
        WReGetter getter = new WReGetter();
        getter.execute(args);
    }

    /** Prints the command line synopsis to stdout. */
    private static void usage() {
        System.out.println(
                "java -jar *.jar org.tallison.cc.WReGetter <numThreads> <digest_url_file> <outputdir>");
        System.out.println("The <digest_url_file> is a tab-delimited UTF-8 file with no escaped tabs");
        System.out.println("It has two columns: digest\\turl");
    }

    /**
     * Validates the arguments, starts the queue filler and the worker pool,
     * waits for every worker to finish and then terminates the JVM.
     *
     * @param args {@code <numThreads> <digest_url_file> <outputdir>}
     * @throws IOException if the digest/url file cannot be opened
     */
    private void execute(String[] args) throws IOException {
        // Check for help first so that a lone "-h" exits successfully instead of
        // being reported as a wrong argument count.
        if (args.length > 0 && args[0].contains("-h")) {
            usage();
            System.exit(0);
        }
        if (args.length != 3) {
            usage();
            System.exit(1);
        }

        int numThreads = -1;
        try {
            numThreads = Integer.parseInt(args[0]);
        } catch (NumberFormatException ignored) {
            // falls through to the validation failure below
        }
        if (numThreads < 1) {
            System.err.println("<numThreads> must be a positive integer, but was: " + args[0]);
            usage();
            System.exit(1);
        }

        // Must be set before any worker is submitted: workers resolve targets against it.
        rootDir = Paths.get(args[2]);

        // Ownership of the reader passes to the QueueFiller, which closes it.
        BufferedReader r = Files.newBufferedReader(Paths.get(args[1]));
        ArrayBlockingQueue<DigestURLPair> queue =
                new ArrayBlockingQueue<DigestURLPair>(QUEUE_CAPACITY);
        QueueFiller filler = new QueueFiller(r, queue, numThreads);
        new Thread(filler).start();

        System.out.println("creating thread pool");
        ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
        ExecutorCompletionService<Integer> executorCompletionService =
                new ExecutorCompletionService<Integer>(executorService);

        System.out.println("about to start");
        for (int i = 0; i < numThreads; i++) {
            System.out.println("submitted " + i);
            executorCompletionService.submit(new WGetter(queue));
        }

        int exitCode = 0;
        try {
            int completed = 0;
            while (completed < numThreads) {
                Future<Integer> future = executorCompletionService.poll(1, TimeUnit.SECONDS);
                if (future == null) {
                    continue;
                }
                completed++;
                try {
                    // Surface anything that escaped the worker instead of dropping it.
                    future.get();
                } catch (ExecutionException e) {
                    System.err.println("worker terminated abnormally");
                    e.printStackTrace();
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.err.println("interrupted while waiting for workers; shutting down");
            exitCode = 1;
        } finally {
            executorService.shutdownNow();
        }
        System.exit(exitCode);
    }

    /**
     * Producer: reads the digest/url file line by line into the queue and then
     * adds one poison pill per worker, even if reading failed part-way.
     */
    private static class QueueFiller implements Runnable {
        private final BufferedReader reader;
        private final ArrayBlockingQueue<DigestURLPair> queue;
        private final int numThreads;

        private QueueFiller(BufferedReader reader, ArrayBlockingQueue<DigestURLPair> q,
                            int numThreads) {
            this.reader = reader;
            this.queue = q;
            this.numThreads = numThreads;
        }

        @Override
        public void run() {
            try (BufferedReader in = reader) {
                String line;
                while ((line = in.readLine()) != null) {
                    DigestURLPair pair = parse(line);
                    if (pair != null) {
                        // blocks while the queue is full
                        queue.put(pair);
                    }
                }
            } catch (IOException e) {
                System.err.println("queue filler failed while reading the digest/url file");
                e.printStackTrace();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } finally {
                System.out.println("queue filler has finished");
                // Always poison the queue; otherwise the workers would wait forever
                // after a read failure.
                addPoison();
                System.out.println("queue filler has finished adding poison");
            }
        }

        /**
         * Splits one input line into a digest/url pair.
         *
         * @param line a raw line of the input file
         * @return the pair, or {@code null} if the line has no url and must be skipped;
         *         the digest is {@code null} when the line has a single column or an
         *         empty first column
         */
        private DigestURLPair parse(String line) {
            String[] cols = line.split("\t");
            if (cols.length == 0) {
                // the line consisted of tabs only
                return null;
            }
            String digest = null;
            String url;
            if (cols.length == 1) {
                url = cols[0];
            } else {
                digest = cols[0];
                url = cols[1];
            }
            if (url.trim().isEmpty()) {
                return null;
            }
            if (digest != null && digest.isEmpty()) {
                // an empty digest column cannot name a target; compute it after download
                digest = null;
            }
            return new DigestURLPair(digest, url);
        }

        /**
         * Adds one poison pill per worker. Interrupts are deferred until every pill
         * has been queued and the interrupt status is restored afterwards.
         */
        private void addPoison() {
            boolean interrupted = Thread.interrupted();
            try {
                for (int i = 0; i < numThreads; i++) {
                    boolean added = false;
                    while (!added) {
                        try {
                            queue.put(new DigestURLPairPoison());
                            added = true;
                        } catch (InterruptedException e) {
                            interrupted = true;
                        }
                    }
                }
            } finally {
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    /**
     * Consumer: takes digest/url pairs off the queue and fetches each with wget
     * until it receives a poison pill.
     */
    private class WGetter implements Callable<Integer> {
        final int id = WGET_COUNTER.getAndIncrement();
        final ArrayBlockingQueue<DigestURLPair> queue;

        WGetter(ArrayBlockingQueue<DigestURLPair> q) {
            this.queue = q;
            System.out.println("WGETTER STARTED");
        }

        /**
         * Processes queue entries until poisoned or interrupted.
         *
         * @return 1 after receiving a poison pill, 0 if the worker was interrupted
         */
        @Override
        public Integer call() {
            while (true) {
                DigestURLPair p;
                try {
                    p = queue.poll(1, TimeUnit.SECONDS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return 0;
                }
                if (p == null) {
                    // poll timed out: nothing queued yet, keep waiting for work or poison
                    continue;
                }
                if (p instanceof DigestURLPairPoison) {
                    return 1;
                }
                System.out.println("WGOT: " + id + " : " + p.url);
                try {
                    wget(p);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return 0;
                } catch (IOException | RuntimeException e) {
                    // Worker boundary: one bad entry must not take the worker down.
                    System.err.println(id + " failed: " + p.digest + " : " + p.url + " : " + e);
                }
            }
        }

        /**
         * Fetches one url. With a known digest the file is written straight to its
         * repository location (and skipped if already present); otherwise it is
         * downloaded to a temp file, digested and then copied into the repository.
         *
         * @throws IOException          if wget cannot be started, exits non-zero, or a
         *                              file operation fails
         * @throws InterruptedException if the thread is interrupted while wget runs
         */
        private void wget(DigestURLPair p) throws IOException, InterruptedException {
            if (p.digest != null) {
                Path targetPath = repoPath(p.digest);
                if (Files.isRegularFile(targetPath)) {
                    return;
                }
                Files.createDirectories(targetPath.getParent());
                download(p.url, targetPath);
                System.out.println(id + " finished: " + p.digest + " : " + p.url);
                return;
            }

            Path tmp = Files.createTempFile("wgetter", "tmp");
            try {
                download(p.url, tmp);
                String digest;
                try (InputStream in = Files.newInputStream(tmp)) {
                    digest = base32.encodeToString(DigestUtils.sha1(in));
                }
                System.out.println(id + " digest: " + digest);
                Path repoTargetFile = repoPath(digest);
                if (Files.exists(repoTargetFile)) {
                    System.out.println("Already had file: " + digest);
                    return;
                }
                Files.createDirectories(repoTargetFile.getParent());
                try {
                    Files.copy(tmp, repoTargetFile);
                } catch (FileAlreadyExistsException e) {
                    // another worker stored the same content between the check and the copy
                    System.out.println("Already had file: " + digest);
                    return;
                }
                System.out.println(id + " finished: " + digest + " : " + p.url);
            } finally {
                Files.deleteIfExists(tmp);
            }
        }

        /**
         * Runs {@code wget -t 1 -O <target> <url>} and waits for it to exit. If wget
         * cannot be started, is interrupted, or exits non-zero, the target file is
         * removed so that a failed download is never mistaken for a fetched file.
         */
        private void download(String url, Path target) throws IOException, InterruptedException {
            System.out.println(id + " going to get " + url);
            ProcessBuilder pb = new ProcessBuilder(
                    "wget",
                    "-t", "1", //just try once
                    "-O", target.toString(),
                    url);
            pb.inheritIO();
            Process process = null;
            boolean succeeded = false;
            try {
                process = pb.start();
                int exit = process.waitFor();
                if (exit != 0) {
                    throw new IOException("wget exited with status " + exit + " for " + url);
                }
                succeeded = true;
            } finally {
                if (!succeeded) {
                    if (process != null) {
                        process.destroy();
                    }
                    Files.deleteIfExists(target);
                }
            }
        }

        /**
         * Maps a digest to {@code rootDir/<first two chars>/<digest>}.
         *
         * @throws IllegalArgumentException if the digest is shorter than two characters
         */
        private Path repoPath(String digest) {
            if (digest.length() < 2) {
                throw new IllegalArgumentException(
                        "digest must have at least 2 characters: '" + digest + "'");
            }
            return rootDir.resolve(digest.substring(0, 2) + "/" + digest);
        }
    }

    /** A url to fetch together with its digest; the digest may be {@code null}. */
    private static class DigestURLPair {
        final String digest;
        final String url;

        DigestURLPair(String digest, String url) {
            this.digest = digest;
            this.url = url;
        }
    }

    /** Sentinel queue entry telling a worker to stop. */
    private static class DigestURLPairPoison extends DigestURLPair {
        DigestURLPairPoison() {
            super(null, null);
        }
    }
}