// Java tutorial
/**
 * SPDXVersion: SPDX-1.1
 * Creator: Person: Nuno Brito (nuno.brito@triplecheck.de)
 * Creator: Organization: TripleCheck (contact@triplecheck.de)
 * Created: 2014-07-13T13:36:54Z
 * LicenseName: EUPL-1.1-without-appendix
 * FileName: RepDownload.java
 * FileType: SOURCE
 * FileCopyrightText: <text> Copyright 2014 Nuno Brito, TripleCheck </text>
 * FileComment: <text>
 *
 * This is the class that will take a text file containing a list of
 * repositories on github to then add these respective source code files inside
 * each repository on a big archive.
 *
 * The first step is getting the list of repositories, which initially will be
 * scoped to a specific programming language. The second step is getting the
 * files from the repository onto a folder on disk. From here we add them up
 * to the big archive, mark the repository as processed and move forward to the
 * next one on our list.
 * </text>
 */
package actions;

import big.BigZip;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import main.Repositories;
import org.apache.commons.io.FileDeleteStrategy;
import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.storage.file.FileRepositoryBuilder;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import utils_deprecated.files;

/**
 * Reads a list of GitHub repositories from a text file, clones each one using
 * a small pool of worker threads ("slots"), strips the files that are not of
 * the wanted type and adds what remains to the big archive.
 *
 * Thread-safety: the slot table is guarded by {@code slotLock}, the session
 * counter is atomic and archive writes are serialised through
 * {@link #writeBigFiles(File, int)}. The queue database and the repository
 * reader are only touched by the thread that calls {@link #start(String[])}.
 *
 * @author Nuno Brito, 12th of July 2014 in Darmstadt, Germany
 */
public class RepDownload {

    // where we store the logs of data being processed
    private File fileLog;
    private BufferedReader reader;
    private FileReader fileReader;

    // where we test placing all the files
    final String testArchive = "./storage/test.big";
    final File fileArchive = new File(testArchive),
            fileExceptionHappened = new File("./failed.txt"),
            folderDownload = new File("./download/");

    final BigZip bigArchive = new BigZip(fileArchive);

    // repositories downloaded on the current session; incremented by the
    // worker threads, hence atomic
    private final AtomicLong repCounter = new AtomicLong();

    // what kind of file type are we looking for?
    private String wantedFileType = ".java";

    // the database where we store the ongoing-downloads
    DB dbQueue;

    // A simple way to allocate working threads: true means "slot in use"
    private final boolean[] slots = new boolean[10];

    // guards every read and write of the slots array, which is shared between
    // the launching thread and the worker threads
    private final Object slotLock = new Object();

    /**
     * Start the processing of repositories and respective files
     * @param args command line arguments; args[1] is the path of the text
     * file listing the repositories and args[2] the wanted file extension
     */
    public void start(String args[]) {
        // pre-flight check
        if (args == null || args.length < 3) {
            System.out.println("DR45 - Error, not enough parameters!");
            return;
        }
        // get the repository that we want to process
        final String repository = args[1];
        // define the type of file that we want to index
        wantedFileType = args[2];
        System.out.println("Selected file type: " + wantedFileType);

        // open this file
        File fileRepository = new File(repository);
        // check our log file
        checkLogFile(fileRepository.getName());
        // open up our repository file for reading
        startOperation(fileRepository);
        // initialize the queue database
        initializeQueueDB();
        // now read and process each repository line
        readLines();
        // all done
        concludeOperation();
    }

    /**
     * Gets the persistent storage working. This is necessary for keeping track
     * of what is currently being processed. If for some reason processing is
     * interrupted abruptly, on the next restart we can proceed from the last
     * saved point.
     */
    private void initializeQueueDB() {
        // assign the folder for our queue database
        File folder = new File(".", "queue");
        // create the folder
        utils_deprecated.files.mkdirs(folder);
        // get the database started
        dbQueue = DBMaker.newFileDB(new File(folder, "queue.db"))
                .closeOnJvmShutdown()
                .make();
        System.out.println("Queue database initialized: " + folder.getAbsolutePath());

        // now check for repositories on the queue that we are missing to launch
        ConcurrentNavigableMap<String, Long> map = dbQueue.getTreeMap("queue");
        for (String repository : map.descendingKeySet()) {
            System.out.println(repository);
            waitForFreeSlot();
            // launch the repository download
            System.out.println("Resuming: " + repository);
            launchNewDownload(repository, "github.com");
            queueRemove(repository);
        }
    }

    /**
     * Adds a repository to our queue
     * @param repository A repository line as extracted from the text file
     */
    private void queueAdd(final String repository) {
        // this is a good moment to save a recovery point right here
        ConcurrentNavigableMap<String, Long> map = dbQueue.getTreeMap("queue");
        // put on the database, if it is already there then it gets overwritten
        map.put(repository, System.currentTimeMillis());
        dbQueue.commit();
    }

    /**
     * Removes a repository from our queue
     * @param repository A repository line as extracted from the text file
     */
    private void queueRemove(final String repository) {
        ConcurrentNavigableMap<String, Long> map = dbQueue.getTreeMap("queue");
        // drop the entry and persist the change
        map.remove(repository);
        dbQueue.commit();
    }

    /**
     * All repositories that were processed get written on a log file.
     * Creates the log file when missing and exits the JVM when that fails.
     * @param logId prefix of the log file name ("-log.txt" is appended)
     */
    private void checkLogFile(final String logId) {
        fileLog = new File(logId + "-log.txt");
        // the file doesn't exist, let's create one then
        if (!fileLog.exists()) {
            // create a fresh log file
            utils_deprecated.files.touch(fileLog);
        }
        // if the file doesn't still exist, then something wrong happened, exit
        if (!fileLog.exists()) {
            System.out.println("DR73 - Unable to create a log file: "
                    + fileLog.getAbsolutePath());
            System.exit(-1);
        }
    }

    /**
     * Open the repository file for starting to read lines. The header line is
     * skipped and, when the log file mentions a previously processed
     * repository, the reader is advanced past it.
     * @param fileRepository the text file listing the repositories
     */
    private void startOperation(final File fileRepository) {
        // initialize the slots
        synchronized (slotLock) {
            Arrays.fill(slots, false);
        }

        // open the repository file stream
        try {
            // open the file streams
            fileReader = new FileReader(fileRepository);
            reader = new BufferedReader(fileReader);
            // avoid the header line
            reader.readLine();
        } catch (FileNotFoundException ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
            System.out.println("DR104 - Failed to open streams");
            System.exit(-1);
        } catch (IOException ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
        }

        // the file is initialized, now get the last line processed (if any)
        final String lastRepository = utils_deprecated.files.getLastLine(fileLog);
        // at this point we want to resume the operation (if necessary)
        if (lastRepository != null && !lastRepository.isEmpty()) {
            System.out.println("Resuming the indexing after " + lastRepository);
            utils_deprecated.time.wait(3);
            moveToLastLine(lastRepository);
        }
    }

    /**
     * Moves the buffered reader pointer to the next line after the last one
     * that was processed and logged with success. Exits the JVM when the
     * repository is not found on the list.
     * @param lastRep The name of the last repository written on the log
     */
    private void moveToLastLine(final String lastRep) {
        // The log stores only the repository name (the text before the first
        // space of a line), so the name must be followed by a space or be the
        // whole line. A plain prefix test would stop too early, e.g. on
        // "user/repo2 ..." when looking for "user/repo".
        final String prefix = lastRep + " ";
        try {
            String line;
            // read through each line
            while ((line = reader.readLine()) != null) {
                if (line.equals(lastRep) || line.startsWith(prefix)) {
                    // we found a match
                    break;
                }
            }
            // we can't have a null value, this means that the rep doesn't exist
            if (line == null) {
                System.out.println("DR - Error, last mentioned repository "
                        + "doesn't exist: " + lastRep);
                System.exit(-1);
            }
        } catch (IOException ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Close the open files
     */
    private void concludeOperation() {
        // check if our reader object is still open
        if (reader != null) {
            try {
                // close it up
                reader.close();
            } catch (IOException ex) {
                Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        // now close the file reader
        if (fileReader != null) {
            try {
                fileReader.close();
            } catch (IOException ex) {
                Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }

    /**
     * Read each line on the repository file and process the files accordingly
     */
    private void readLines() {
        String repository;
        try {
            // go through each line until null
            while ((repository = reader.readLine()) != null) {
                waitForFreeSlot();
                // NOTE(review): the queue entry is removed as soon as the
                // worker thread has been launched, not when the download has
                // finished, so the queue does not survive an interruption in
                // the middle of a download. Kept as-is because removing later
                // would make a resume process repositories twice.
                queueAdd(repository);
                launchNewDownload(repository, "github.com");
                queueRemove(repository);
            }
        } catch (IOException ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Blocks the calling thread, polling every 5 units of
     * {@code utils_deprecated.time.wait}, until a slot is free.
     */
    private void waitForFreeSlot() {
        while (!hasFreeSlots()) {
            utils_deprecated.time.wait(5);
        }
    }

    /**
     * Check all the slots being used right now.
     *
     * @return if one of them is free them return the value as true.
     * Otherwise, returns false
     */
    private boolean hasFreeSlots() {
        synchronized (slotLock) {
            for (boolean used : slots) {
                if (!used) {
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Marks the first free slot as used.
     * @return the index of the slot that was taken, or -1 when all are busy
     */
    private int acquireSlot() {
        synchronized (slotLock) {
            for (int i = 0; i < slots.length; i++) {
                if (!slots[i]) {
                    slots[i] = true;
                    return i;
                }
            }
            return -1;
        }
    }

    /**
     * Marks a slot as free again.
     * @param slotId index of the slot to free
     */
    private void releaseSlot(final int slotId) {
        synchronized (slotLock) {
            slots[slotId] = false;
        }
    }

    /**
     * Launches the new download of a repository on its own thread, using the
     * first free slot. Exits the JVM when no slot is free.
     * @param repositoryDetails A repository line from our text file
     * @param specificFolder sub-folder (below the slot folder) to download to
     */
    private void launchNewDownload(final String repositoryDetails,
            final String specificFolder) {
        final int id = acquireSlot();
        if (id < 0) {
            // no free slots? This shouldn't happen, signal a serious error
            System.out.println("No empty slots, operation failed!");
            System.exit(-1);
            return;
        }
        // now get the files using a thread
        Thread thread = new Thread() {
            @Override
            public void run() {
                getRepository(repositoryDetails, id, specificFolder);
            }
        };
        thread.start();
    }

    /**
     * Downloads a repository from the Internet, adds it up to the big archive
     * and then frees the slot so that the next one can be processed.
     * @param repositoryDetails A repository line from our text file
     * @param slotId the slot (and download sub-folder) assigned to this run
     * @param specificFolder sub-folder (below the slot folder) to download to
     */
    private void getRepository(final String repositoryDetails, final int slotId,
            final String specificFolder) {
        // download() frees the slot by itself when it fails; remember that so
        // the slot is never freed twice (it could already belong to a new
        // worker by then)
        boolean slotFreedByDownload = false;
        try {
            // there always needs to be a space, otherwise exit the processing
            int i1 = repositoryDetails.indexOf(" ");
            if (i1 == -1) {
                System.out.println("DR195 - Error, unable to process: " + repositoryDetails);
                System.exit(-1);
            }
            // now get the repository name
            final String repository = repositoryDetails.substring(0, i1);
            System.out.println(slotId + "-> Downloading: " + repository);

            // Create the path where the files will be downloaded onto
            File localPath = new File(folderDownload + "/" + slotId + "/" + specificFolder,
                    repository);
            utils_deprecated.files.mkdirs(localPath);

            // do the download of files
            boolean success = download(slotId, localPath, repository);
            //svnExport(repository, localPath);

            // did we had success in downloading the files?
            if (!success) {
                // move to the next one on the list
                slotFreedByDownload = true;
                return;
            }

            // delete the files that are not needed
            files.deleteHiddenFilesAndFolders(localPath, 25);
            deleteUnwantedFiles(localPath, 25);
            // now archive these files
            writeBigFiles(localPath, slotId);

            // now delete these files
            try {
                File localDelete = new File(folderDownload.getCanonicalPath(), slotId + "");
                files.deleteHiddenFilesAndFolders(localDelete, 25);
                FileDeleteStrategy.FORCE.deleteQuietly(localDelete);
            } catch (IOException ex) {
                Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
            }

            // all done with success, write this name on our log of actions
            utils_deprecated.files.addTextToFile(fileLog, "\n" + repository);
            // all done
            long count = repCounter.incrementAndGet();
            System.out.println(slotId + "--> #" + count + ": " + repository);
        } finally {
            // clean up the slot, even when something above failed unexpectedly;
            // otherwise the slot would stay blocked for the rest of the run
            if (!slotFreedByDownload) {
                releaseSlot(slotId);
            }
        }
    }

    /**
     * The synchronised method that will write our files
     * @param localPath folder whose content is added to the big archive
     * @param slotId slot whose download folder is used as archive base path
     */
    private synchronized void writeBigFiles(final File localPath, int slotId) {
        // now archive these files
        bigArchive.setBasePath(folderDownload.getAbsolutePath() + "/" + slotId);
        bigArchive.addFolder(localPath);
    }

    /**
     * Get the files from a given repository on github. When the download
     * fails, the location is appended to the failure list, the partial files
     * are deleted and the slot is marked as free.
     * @param slotId The thread where the download is happening
     * @param localPath The bigArchive on disk where the files will be placed
     * @param location The username/repository identification. We'd
     * expect something like triplecheck/reporter
     * @return True if everything went ok, false if something went wrong
     */
    public Boolean download(final int slotId, final File localPath, final String location) {
        // we can't have any older files
        files.deleteDir(localPath);

        final String remoteUrl = "https://github.com/" + location + ".git";

        try {
            Git git = Git.cloneRepository()
                    .setURI(remoteUrl)
                    .setDirectory(localPath)
                    .call();
            // the clone result keeps its repository open; close it so that the
            // file handles are released before we start deleting files
            git.getRepository().close();

            // now open the created repository
            FileRepositoryBuilder builder = new FileRepositoryBuilder();
            Repository repository = builder.setGitDir(localPath)
                    .readEnvironment() // scan environment GIT_* variables
                    .findGitDir() // scan up the file system tree
                    .build();
            try {
                System.out.println(slotId + "-> Downloaded repository: "
                        + repository.getDirectory());
            } finally {
                repository.close();
            }
        } catch (Exception ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
            // we need to add this event on the exception list
            utils_deprecated.files.addTextToFile(fileExceptionHappened, "\n" + location);
            // delete the files if possible
            files.deleteDir(localPath);
            System.out.println(slotId + " !!!! Failed to download: " + location);
            // clean up the slot
            releaseSlot(slotId);
            return false;
        }

        System.out.println(slotId + "-> Downloaded: " + location);
        return true;
    }

    /**
     * Use the SVN binary installed on a Linux/OSX machine to export the contents
     * from an online repository
     * @param userRepository A combination of user and repository. e.g. userA/repB
     * @param exportPath To where we want to download the files
     */
    private void svnExport(final String userRepository, final File exportPath) {
        try {
            // pass the arguments as a list so that paths containing spaces
            // are handed to svn as a single argument
            ProcessBuilder builder = new ProcessBuilder(
                    "svn", "export", "--force",
                    "https://github.com/" + userRepository + "/trunk",
                    exportPath.getAbsolutePath());
            builder.redirectErrorStream(true);
            Process p = builder.start();

            // read and discard the output, otherwise the child process blocks
            // as soon as the pipe buffer is full
            InputStream output = p.getInputStream();
            try {
                byte[] buffer = new byte[4096];
                while (output.read(buffer) != -1) {
                    // nothing to do, the output is not needed
                }
            } finally {
                output.close();
            }

            int exitVal = p.waitFor();
            if (exitVal != 0) {
                Logger.getLogger(RepDownload.class.getName()).log(Level.WARNING,
                        "svn export of {0} finished with exit code {1}",
                        new Object[]{userRepository, exitVal});
            }
        } catch (IOException ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
        } catch (InterruptedException ex) {
            Logger.getLogger(RepDownload.class.getName()).log(Level.SEVERE, null, ex);
            // keep the interruption visible to the caller
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Delete unwanted files from a given repository that was freshly
     * downloaded
     * @param where folder to clean up
     * @param maxDeep how many folder levels (including this one) to visit
     */
    private void deleteUnwantedFiles(final File where, int maxDeep) {
        // list the files on the current directory
        File[] children = where.listFiles();
        // no need to continue if nothing was found
        if (children == null) {
            return;
        }
        // go through each file
        for (File child : children) {
            if (child.isFile()) {
                // delete the file when it is not of the wanted type
                deleteUnwantedFile(child);
            } else if (child.isDirectory() && (maxDeep - 1 > 0)) {
                // do the recursive crawling
                deleteUnwantedFiles(child, maxDeep - 1);
            }
        }
    }

    /**
     * Delete a file if it recognised as non-desired, meaning that its name
     * does not end with the wanted file type.
     * @param file The file to analyse
     */
    private void deleteUnwantedFile(final File file) {
        // get the file name
        final String fileName = file.getName();
        // at the moment we only want files of the selected type
        if (!fileName.endsWith(wantedFileType)) {
            FileDeleteStrategy.FORCE.deleteQuietly(file);
        }
    }
}