List of usage examples for java.util Map.Entry get
/**
 * Looks up a value of type {@code V} by key.
 * NOTE(review): this bare signature matches {@code java.util.Map#get(Object)} rather than a
 * member of {@code Map.Entry}; confirm which API the surrounding listing intends.
 *
 * @param key the key to look up
 * @return the value for {@code key}
 */
V get(Object key);
From source file:org.springframework.boot.cli.command.init.ProjectGenerationRequest.java
/**
 * Prunes {@code projects} in place, keeping only the entries whose project type carries
 * {@code tag} with a value equal to {@code tagValue}.
 *
 * @param projects the map to prune; entries are removed through its entry-set iterator
 * @param tag the tag name looked up in each project type's tags
 * @param tagValue the value the tag must have for an entry to be kept
 */
private static void filter(Map<String, ProjectType> projects, String tag, String tagValue) {
    Iterator<Map.Entry<String, ProjectType>> entries = projects.entrySet().iterator();
    while (entries.hasNext()) {
        ProjectType candidate = entries.next().getValue();
        String actual = candidate.getTags().get(tag);
        if (tagValue.equals(actual)) {
            continue;
        }
        // Removing through the iterator keeps the traversal valid.
        entries.remove();
    }
}
From source file:io.Tools.java
/** * Create test PDB and Chemcomp folder. Also all PDB files in resources are copied there so all test can use this * folder/*from w w w . j av a 2 s. c o m*/ * * @return */ public static String createPermanentTestFolder() { String d = System.getProperty("user.home"); String builtTestFolder = d + File.separator + "Documents" + File.separator + testFolderName + File.separator; final File baseDir = new File(builtTestFolder); String builttestPDBFolder = builtTestFolder + File.separator + "pdb"; baseDir.mkdirs(); final File pdbDir = new File(builttestPDBFolder); if (Files.exists(Paths.get(builttestPDBFolder))) { try { FileUtils.deleteDirectory(pdbDir); } catch (IOException e) { } } pdbDir.mkdir(); String builttestChemcompFolder = builtTestFolder + File.separator + "chemcomp"; final File chemcompDir = new File(builttestChemcompFolder); if (Files.exists(Paths.get(builttestChemcompFolder))) { try { FileUtils.deleteDirectory(chemcompDir); } catch (IOException e) { } } chemcompDir.mkdirs(); pdbDir.mkdir(); testChemcompFolder = builtTestFolder; testPDBFolder = builttestPDBFolder; String resourcesPDBFolder = null; try { URL url = BiojavaReaderFromPDBFolderTest.class.getClassLoader().getResource("pdb/1di9.cif.gz"); File pdb1di9file = new File(url.toURI()); resourcesPDBFolder = pdb1di9file.getParent(); Map<String, List<MMcifFileInfos>> indexPDBFileInFolder = IOTools .indexPDBFileInFolder(new File(resourcesPDBFolder).toString()); for (Map.Entry<String, List<MMcifFileInfos>> entry : indexPDBFileInFolder.entrySet()) { try { FileUtils.copyFileToDirectory(new File(entry.getValue().get(0).getPathToFile().toString()), pdbDir); } catch (IOException e) { e.printStackTrace(); } } } catch (URISyntaxException e) { e.printStackTrace(); } String resourcesChemcompFolder = null; try { URL url = BiojavaReaderFromPDBFolderTest.class.getClassLoader().getResource("chemcomp/0DY.cif.gz"); File chemcomp0DY = new File(url.toURI()); resourcesChemcompFolder = chemcomp0DY.getParent(); Map<String, 
List<Path>> indexPDBFileInFolder = IOTools .indexChemcompFileInFolder(new File(resourcesChemcompFolder).toString()); for (Map.Entry<String, List<Path>> entry : indexPDBFileInFolder.entrySet()) { try { FileUtils.copyFileToDirectory(new File(entry.getValue().get(0).toString()), new File(builttestChemcompFolder)); } catch (IOException e) { e.printStackTrace(); } } } catch (URISyntaxException e) { e.printStackTrace(); } return testChemcompFolder; }
From source file:com.mirth.connect.server.util.AttachmentUtil.java
public static byte[] reAttachMessage(String raw, ImmutableConnectorMessage connectorMessage, String charsetEncoding, boolean binary) { try {// www . ja va 2 s.c o m Map<Integer, Map<Integer, Object>> replacementObjects = new TreeMap<Integer, Map<Integer, Object>>(); // Determine the buffersize during the first pass for better memory performance int bufferSize = raw.length(); int index = 0; int endIndex; // Initialize the objects here so only one retrieval of the attachment content is ever needed. byte[] dicomObject = null; Map<String, Attachment> attachmentMap = null; // Handle the special case if only a dicom message is requested. // In this case we can skip any byte appending and thus do not need to base64 encode the dicom object // if the type is binary. if (raw.trim().equals(PREFIX + DICOM_KEY + SUFFIX)) { dicomObject = DICOMUtil.getDICOMRawBytes(connectorMessage); if (!binary) { dicomObject = Base64Util.encodeBase64(dicomObject); } return dicomObject; } // Check the raw string in one pass for any attachments. // Stores the start and end indices to replace, along with the attachment content. while ((index = raw.indexOf(PREFIX, index)) != -1) { if (raw.startsWith(DICOM_KEY + SUFFIX, index + PREFIX.length())) { if (dicomObject == null) { // Unfortunately, if the dicom data needs to appended to other base64 data, it must be done so in base64. 
dicomObject = Base64Util.encodeBase64(DICOMUtil.getDICOMRawBytes(connectorMessage)); } endIndex = index + PREFIX.length() + DICOM_KEY.length() + SUFFIX.length(); Map<Integer, Object> replacementMap = new HashMap<Integer, Object>(); replacementMap.put(KEY_END_INDEX, endIndex); replacementMap.put(KEY_DATA, dicomObject); replacementObjects.put(index, replacementMap); bufferSize += dicomObject.length; index += endIndex - index; } else if (raw.startsWith(ATTACHMENT_KEY, index + PREFIX.length())) { if (attachmentMap == null) { List<Attachment> list = getMessageAttachments(connectorMessage); // Store the attachments in a map with the attachment's Id as the key attachmentMap = new HashMap<String, Attachment>(); for (Attachment attachment : list) { attachmentMap.put(attachment.getId(), attachment); } } int attachmentIdStartIndex = index + PREFIX.length() + ATTACHMENT_KEY.length(); int attachmentIdEndIndex = attachmentIdStartIndex + ATTACHMENT_ID_LENGTH; endIndex = attachmentIdEndIndex + SUFFIX.length(); String attachmentId = raw.substring(attachmentIdStartIndex, attachmentIdStartIndex + ATTACHMENT_ID_LENGTH); if (raw.substring(attachmentIdEndIndex, endIndex).equals(SUFFIX)) { Map<Integer, Object> replacementMap = new HashMap<Integer, Object>(); replacementMap.put(KEY_END_INDEX, endIndex); if (attachmentMap.containsKey(attachmentId)) { Attachment attachment = attachmentMap.get(attachmentId); replacementMap.put(KEY_DATA, attachment.getContent()); bufferSize += attachment.getContent().length; } else { replacementMap.put(KEY_DATA, new byte[0]); } replacementObjects.put(index, replacementMap); } } else { endIndex = index + PREFIX.length(); } index += endIndex - index; } // Release the object pointers of the attachment content so they aren't held in memory for the entire method dicomObject = null; attachmentMap = null; // Initialize the stream's buffer size. 
The buffer size will always be slightly large than needed, // because the template keys are never removed from the buffer size. // It is not worth doing any extra calculations for the amount of memory saved. ByteArrayOutputStream baos = new ByteArrayOutputStream(bufferSize); int segmentStartIndex = 0; for (Map.Entry<Integer, Map<Integer, Object>> entry : replacementObjects.entrySet()) { int startReplacementIndex = entry.getKey(); int endReplacementIndex = (Integer) entry.getValue().get(KEY_END_INDEX); byte[] data = (byte[]) entry.getValue().get(KEY_DATA); // Allows the memory used by the attachments to be released at the end of the loop entry.getValue().clear(); byte[] templateSegment; // If the data is binary, the content should be in base64, so using US-ASCII as the charset encoding should be sufficient. if (binary) { templateSegment = StringUtils .getBytesUsAscii(raw.substring(segmentStartIndex, startReplacementIndex)); } else { templateSegment = StringUtil.getBytesUncheckedChunked( raw.substring(segmentStartIndex, startReplacementIndex), Constants.ATTACHMENT_CHARSET); } baos.write(templateSegment); baos.write(data); segmentStartIndex = endReplacementIndex; } byte[] templateSegment; if (binary) { templateSegment = StringUtils.getBytesUsAscii(raw.substring(segmentStartIndex)); } else { templateSegment = StringUtil.getBytesUncheckedChunked(raw.substring(segmentStartIndex), Constants.ATTACHMENT_CHARSET); } byte[] combined; // If there are no attachments, don't bother writing to the output stream. if (segmentStartIndex == 0) { combined = templateSegment; } else { // Write the segment after the last replacement. baos.write(templateSegment); combined = baos.toByteArray(); // Release the memory used by the byte array stream. ByteArrayOutputStreams do not need to be closed. baos = null; } templateSegment = null; // If binary, the content should be in base64 so it is necessary to decode the data. 
if (binary) { combined = Base64Util.decodeBase64(combined); } else if (charsetEncoding != null && !charsetEncoding.toUpperCase().equals(Constants.ATTACHMENT_CHARSET.toUpperCase())) { // Convert the byte array to a string using the internal encoding. String combinedString = StringUtils.newString(combined, Constants.ATTACHMENT_CHARSET); // First release the reference to the old byte data so it can be reallocated if necessary. combined = null; // Convert the string to a byte array using the requested encoding combined = StringUtil.getBytesUncheckedChunked(combinedString, charsetEncoding); } return combined; } catch (Exception e) { logger.error("Error reattaching attachments", e); return null; } }
From source file:com.mirth.connect.server.util.MessageAttachmentUtil.java
public static byte[] reAttachMessage(String raw, ImmutableConnectorMessage connectorMessage, String charsetEncoding, boolean binary) { try {//w w w . j av a 2s . co m Map<Integer, Map<Integer, Object>> replacementObjects = new TreeMap<Integer, Map<Integer, Object>>(); // Determine the buffersize during the first pass for better memory performance int bufferSize = raw.length(); int index = 0; int endIndex; // Initialize the objects here so only one retrieval of the attachment content is ever needed. byte[] dicomObject = null; Map<String, Attachment> attachmentMap = null; // Handle the special case if only a dicom message is requested. // In this case we can skip any byte appending and thus do not need to base64 encode the dicom object // if the type is binary. if (raw.trim().equals(PREFIX + DICOM_KEY + SUFFIX)) { dicomObject = DICOMMessageUtil.getDICOMRawBytes(connectorMessage); if (!binary) { dicomObject = Base64Util.encodeBase64(dicomObject); } return dicomObject; } // Check the raw string in one pass for any attachments. // Stores the start and end indices to replace, along with the attachment content. while ((index = raw.indexOf(PREFIX, index)) != -1) { if (raw.startsWith(DICOM_KEY + SUFFIX, index + PREFIX.length())) { if (dicomObject == null) { // Unfortunately, if the dicom data needs to appended to other base64 data, it must be done so in base64. 
dicomObject = Base64Util.encodeBase64(DICOMMessageUtil.getDICOMRawBytes(connectorMessage)); } endIndex = index + PREFIX.length() + DICOM_KEY.length() + SUFFIX.length(); Map<Integer, Object> replacementMap = new HashMap<Integer, Object>(); replacementMap.put(KEY_END_INDEX, endIndex); replacementMap.put(KEY_DATA, dicomObject); replacementObjects.put(index, replacementMap); bufferSize += dicomObject.length; index += endIndex - index; } else if (raw.startsWith(ATTACHMENT_KEY, index + PREFIX.length())) { if (attachmentMap == null) { List<Attachment> list = getMessageAttachments(connectorMessage); // Store the attachments in a map with the attachment's Id as the key attachmentMap = new HashMap<String, Attachment>(); for (Attachment attachment : list) { attachmentMap.put(attachment.getId(), attachment); } } int attachmentIdStartIndex = index + PREFIX.length() + ATTACHMENT_KEY.length(); int attachmentIdEndIndex = attachmentIdStartIndex + ATTACHMENT_ID_LENGTH; endIndex = attachmentIdEndIndex + SUFFIX.length(); String attachmentId = raw.substring(attachmentIdStartIndex, attachmentIdStartIndex + ATTACHMENT_ID_LENGTH); if (raw.substring(attachmentIdEndIndex, endIndex).equals(SUFFIX)) { Map<Integer, Object> replacementMap = new HashMap<Integer, Object>(); replacementMap.put(KEY_END_INDEX, endIndex); if (attachmentMap.containsKey(attachmentId)) { Attachment attachment = attachmentMap.get(attachmentId); replacementMap.put(KEY_DATA, attachment.getContent()); bufferSize += attachment.getContent().length; } else { replacementMap.put(KEY_DATA, new byte[0]); } replacementObjects.put(index, replacementMap); } } else { endIndex = index + PREFIX.length(); } index += endIndex - index; } // Release the object pointers of the attachment content so they aren't held in memory for the entire method dicomObject = null; attachmentMap = null; // Initialize the stream's buffer size. 
The buffer size will always be slightly large than needed, // because the template keys are never removed from the buffer size. // It is not worth doing any extra calculations for the amount of memory saved. ByteArrayOutputStream baos = new ByteArrayOutputStream(bufferSize); int segmentStartIndex = 0; for (Map.Entry<Integer, Map<Integer, Object>> entry : replacementObjects.entrySet()) { int startReplacementIndex = entry.getKey(); int endReplacementIndex = (Integer) entry.getValue().get(KEY_END_INDEX); byte[] data = (byte[]) entry.getValue().get(KEY_DATA); // Allows the memory used by the attachments to be released at the end of the loop entry.getValue().clear(); byte[] templateSegment; // If the data is binary, the content should be in base64, so using US-ASCII as the charset encoding should be sufficient. if (binary) { templateSegment = StringUtils .getBytesUsAscii(raw.substring(segmentStartIndex, startReplacementIndex)); } else { templateSegment = StringUtil.getBytesUncheckedChunked( raw.substring(segmentStartIndex, startReplacementIndex), Constants.ATTACHMENT_CHARSET); } baos.write(templateSegment); baos.write(data); segmentStartIndex = endReplacementIndex; } byte[] templateSegment; if (binary) { templateSegment = StringUtils.getBytesUsAscii(raw.substring(segmentStartIndex)); } else { templateSegment = StringUtil.getBytesUncheckedChunked(raw.substring(segmentStartIndex), Constants.ATTACHMENT_CHARSET); } byte[] combined; // If there are no attachments, don't bother writing to the output stream. if (segmentStartIndex == 0) { combined = templateSegment; } else { // Write the segment after the last replacement. baos.write(templateSegment); combined = baos.toByteArray(); // Release the memory used by the byte array stream. ByteArrayOutputStreams do not need to be closed. baos = null; } templateSegment = null; // If binary, the content should be in base64 so it is necessary to decode the data. 
if (binary) { combined = Base64Util.decodeBase64(combined); } else if (charsetEncoding != null && !charsetEncoding.toUpperCase().equals(Constants.ATTACHMENT_CHARSET.toUpperCase())) { // Convert the byte array to a string using the internal encoding. String combinedString = StringUtils.newString(combined, Constants.ATTACHMENT_CHARSET); // First release the reference to the old byte data so it can be reallocated if necessary. combined = null; // Convert the string to a byte array using the requested encoding combined = StringUtil.getBytesUncheckedChunked(combinedString, charsetEncoding); } return combined; } catch (Exception e) { logger.error("Error reattaching attachments", e); return null; } }
From source file:eu.delving.sip.files.StorageHelper.java
/**
 * Collects one file per name among the files of {@code dir} accepted by a
 * {@code PrefixFileFilter} for {@code fileType}.
 *
 * <p>Files are grouped by the name {@code extractName} yields for them (files yielding
 * {@code null} are skipped). A group with a single file contributes that file; a larger group
 * contributes the file chosen by {@code getRecent(files, 0, fileType)} (presumably the most
 * recent one; confirm against that helper). Results follow the sorted order of the names.
 *
 * @param dir the directory to scan
 * @param fileType the file type used for filtering and for name extraction
 * @return the selected files; empty when {@code dir} cannot be listed
 */
static Collection<File> findLatestPrefixFiles(File dir, Storage.FileType fileType) {
    List<File> latestFiles = new ArrayList<File>();
    File[] files = dir.listFiles(new PrefixFileFilter(fileType));
    if (files == null) {
        // listFiles returns null when dir is not a directory or an I/O error occurs.
        return latestFiles;
    }
    Map<String, List<File>> map = new TreeMap<String, List<File>>();
    for (File file : files) {
        String prefix = extractName(file, fileType);
        if (prefix == null) {
            continue;
        }
        List<File> list = map.get(prefix);
        if (list == null) {
            list = new ArrayList<File>();
            map.put(prefix, list);
        }
        list.add(file);
    }
    for (Map.Entry<String, List<File>> entry : map.entrySet()) {
        List<File> candidates = entry.getValue();
        if (candidates.size() == 1) {
            latestFiles.add(candidates.get(0));
        } else {
            latestFiles.add(getRecent(candidates.toArray(new File[candidates.size()]), 0, fileType));
        }
    }
    return latestFiles;
}
From source file:org.elasticsearch.client.sniff.ElasticsearchNodesSnifferTests.java
/**
 * Builds a randomized JSON document with a {@code nodes} object together with the list of
 * {@link Node}s expected from it, and wraps both via {@code SniffResponse.buildResponse}.
 *
 * <p>Between 1 and 5 nodes are generated. Random {@code bogus_object} / {@code bogus_array}
 * fields are sprinkled into the output, and a node only joins the expected list when its
 * {@code http} section is written.
 *
 * @param scheme the scheme used for every generated {@code HttpHost}
 * @return the response built from the JSON text and the expected nodes
 * @throws IOException declared because of the JSON generator calls
 */
private static SniffResponse buildSniffResponse(ElasticsearchNodesSniffer.Scheme scheme) throws IOException {
    int numNodes = RandomNumbers.randomIntBetween(getRandom(), 1, 5);
    List<Node> nodes = new ArrayList<>(numNodes);
    JsonFactory jsonFactory = new JsonFactory();
    StringWriter writer = new StringWriter();
    JsonGenerator generator = jsonFactory.createGenerator(writer);
    // Root object of the document.
    generator.writeStartObject();
    if (getRandom().nextBoolean()) {
        generator.writeStringField("cluster_name", "elasticsearch");
    }
    if (getRandom().nextBoolean()) {
        generator.writeObjectFieldStart("bogus_object");
        generator.writeEndObject();
    }
    generator.writeObjectFieldStart("nodes");
    for (int i = 0; i < numNodes; i++) {
        String nodeId = RandomStrings.randomAsciiOfLengthBetween(getRandom(), 5, 10);
        String host = "host" + i;
        int port = RandomNumbers.randomIntBetween(getRandom(), 9200, 9299);
        HttpHost publishHost = new HttpHost(host, port, scheme.toString());
        // The publish host is always part of the bound hosts; extra bound hosts are optional.
        Set<HttpHost> boundHosts = new HashSet<>();
        boundHosts.add(publishHost);
        if (randomBoolean()) {
            int bound = between(1, 5);
            for (int b = 0; b < bound; b++) {
                boundHosts.add(new HttpHost(host + b, port, scheme.toString()));
            }
        }
        // Each attribute holds either a single value or between 2 and 5 values.
        int numAttributes = between(0, 5);
        Map<String, List<String>> attributes = new HashMap<>(numAttributes);
        for (int j = 0; j < numAttributes; j++) {
            int numValues = frequently() ? 1 : between(2, 5);
            List<String> values = new ArrayList<>();
            for (int v = 0; v < numValues; v++) {
                values.add(j + "value" + v);
            }
            attributes.put("attr" + j, values);
        }
        Node node = new Node(publishHost, boundHosts, randomAsciiAlphanumOfLength(5),
                randomAsciiAlphanumOfLength(5),
                new Node.Roles(randomBoolean(), randomBoolean(), randomBoolean()), attributes);
        generator.writeObjectFieldStart(nodeId);
        if (getRandom().nextBoolean()) {
            generator.writeObjectFieldStart("bogus_object");
            generator.writeEndObject();
        }
        if (getRandom().nextBoolean()) {
            generator.writeArrayFieldStart("bogus_array");
            generator.writeStartObject();
            generator.writeEndObject();
            generator.writeEndArray();
        }
        // Only nodes whose http section is written are recorded as expected.
        boolean isHttpEnabled = rarely() == false;
        if (isHttpEnabled) {
            nodes.add(node);
            generator.writeObjectFieldStart("http");
            generator.writeArrayFieldStart("bound_address");
            for (HttpHost bound : boundHosts) {
                generator.writeString(bound.toHostString());
            }
            generator.writeEndArray();
            if (getRandom().nextBoolean()) {
                generator.writeObjectFieldStart("bogus_object");
                generator.writeEndObject();
            }
            generator.writeStringField("publish_address", publishHost.toHostString());
            if (getRandom().nextBoolean()) {
                generator.writeNumberField("max_content_length_in_bytes", 104857600);
            }
            generator.writeEndObject();
        }
        // Roles are written in shuffled order, and only those the node actually has.
        List<String> roles = Arrays.asList(new String[] { "master", "data", "ingest" });
        Collections.shuffle(roles, getRandom());
        generator.writeArrayFieldStart("roles");
        for (String role : roles) {
            if ("master".equals(role) && node.getRoles().isMasterEligible()) {
                generator.writeString("master");
            }
            if ("data".equals(role) && node.getRoles().isData()) {
                generator.writeString("data");
            }
            if ("ingest".equals(role) && node.getRoles().isIngest()) {
                generator.writeString("ingest");
            }
        }
        generator.writeEndArray();
        generator.writeFieldName("version");
        generator.writeString(node.getVersion());
        generator.writeFieldName("name");
        generator.writeString(node.getName());
        if (numAttributes > 0) {
            generator.writeObjectFieldStart("attributes");
            for (Map.Entry<String, List<String>> entry : attributes.entrySet()) {
                if (entry.getValue().size() == 1) {
                    // Single value: written under the plain attribute key.
                    generator.writeStringField(entry.getKey(), entry.getValue().get(0));
                } else {
                    // Several values: one field per value, keyed "<attribute>.<position>".
                    for (int v = 0; v < entry.getValue().size(); v++) {
                        generator.writeStringField(entry.getKey() + "." + v, entry.getValue().get(v));
                    }
                }
            }
            generator.writeEndObject();
        }
        // Closes the object of this node.
        generator.writeEndObject();
    }
    // Closes the "nodes" object, then the root object.
    generator.writeEndObject();
    generator.writeEndObject();
    generator.close();
    return SniffResponse.buildResponse(writer.toString(), nodes);
}
From source file:com.iti.request.NearbyService.java
/**
 * Lists the trains whose schedule contains both cities with {@code from} positioned strictly
 * before {@code to}.
 *
 * @param from the city travelled from
 * @param to the city travelled to
 * @return one string per matching train, made of the train key and the time of its {@code from}
 *         stop; empty when no train matches
 */
public static List<String> getTrains(City from, City to) {
    Map<String, List<City>> trainsSchedule = getTrainsSchedule();
    List<String> matches = new ArrayList<>();
    for (Map.Entry<String, List<City>> schedule : trainsSchedule.entrySet()) {
        List<City> stops = schedule.getValue();
        int departure = stops.indexOf(from);
        int arrival = stops.indexOf(to);
        boolean bothServed = departure != -1 && arrival != -1;
        if (bothServed && departure < arrival) {
            matches.add(" :" + schedule.getKey() + " " + stops.get(departure).getTime());
        }
    }
    return matches;
}
From source file:com.incapture.rapgen.output.OutputWriter.java
/** * Some files are composed of multiple templates. So the map passed in here is filename to template order to template. * E.g. "file.txt"->1->"some code" "file.txt"->2->"other code" and so on. * * @param rootFolder//from w w w . j a v a 2 s. c o m * @param pathToTemplate */ public static void writeMultiPartTemplates(String rootFolder, Map<String, Map<String, StringTemplate>> pathToTemplate) { // For each file, dump the templates for (Map.Entry<String, Map<String, StringTemplate>> entry : pathToTemplate.entrySet()) { File file = new File(rootFolder, entry.getKey()); file.getParentFile().mkdirs(); BufferedWriter bow = null; try { bow = new BufferedWriter(new FileWriter(file)); Set<String> sections = entry.getValue().keySet(); SortedSet<String> sorted = new TreeSet<String>(); sorted.addAll(sections); for (String sec : sorted) { bow.write(entry.getValue().get(sec).toString()); bow.newLine(); } bow.close(); } catch (IOException e) { System.err.println(e.getMessage()); } finally { if (bow != null) { try { bow.close(); } catch (IOException e) { System.err.println("Error closing output stream: " + ExceptionToString.format(e)); } } } } }
From source file:rgu.jclos.foldbuilder.FoldBuilder.java
/** * Generates K folds and writes them to disk * @param inputFile The CSV file from which the data comes from. * @param outputDirectory The directory in which the folds will be written. * @param separator The separating character in the CSV file. * @param indexLabel The index of the labels in the CSV file. Used for stratification of the folds. * @param k The number of folds to generates. * @param speak Whether to print some status messages along the way. * @return A pair containing a list of folds with ids of documents, and a dictionary that allows the user to retrieve aformentioned documents using the ids, in order to save space. * @throws IOException If something stops the program from reading or writing the files. *///from w ww .j a v a 2 s .c o m public static Pair<List<Set<String>>, Map<String, Instance>> getFolds(String inputFile, String outputDirectory, String separator, int indexLabel, int k, boolean speak) throws IOException { Random rng = new Random(); Map<String, Instance> dictionary = new HashMap<>(); Map<String, Integer> classes = new HashMap<>(); Map<String, List<String>> reversedDictionary = new HashMap<>(); int id = 0; for (String line : Files.readAllLines(new File(inputFile).toPath())) { Instance inst = new Instance(); String[] elements = line.split(separator); inst.content = line; inst.label = elements[indexLabel]; String iid = "inst" + id; dictionary.put(iid, inst); classes.put(inst.label, classes.getOrDefault(inst.label, 0) + 1); if (reversedDictionary.containsKey(inst.label)) { reversedDictionary.get(inst.label).add(iid); } else { List<String> ids = new ArrayList<>(); ids.add(iid); reversedDictionary.put(inst.label, ids); } id++; } int numberOfInstances = id; int sizeOfEachFold = (int) Math.floor(numberOfInstances / k); Map<String, Double> classRatios = new HashMap<>(); for (Map.Entry<String, Integer> classFrequency : classes.entrySet()) { classRatios.put(classFrequency.getKey(), (double) classFrequency.getValue() / (double) 
numberOfInstances); } List<Set<String>> folds = new ArrayList<>(); for (int i = 0; i < k; i++) { Set<String> fold = new HashSet<>(); for (Map.Entry<String, List<String>> c : reversedDictionary.entrySet()) { int currentSize = fold.size(); int numberRequired = (int) Math.floor(classRatios.get(c.getKey()) * sizeOfEachFold); while (fold.size() < currentSize + numberRequired && c.getValue().size() > 0) { int nextPick = rng.nextInt(c.getValue().size()); fold.add(c.getValue().get(nextPick)); c.getValue().remove(nextPick); } } folds.add(fold); if (speak) System.out.println("Finished computing fold " + (i + 1) + " of size " + fold.size()); } if (speak) System.out.println("Writing folds on disk"); return Pair.of(folds, dictionary); }
From source file:rgu.jclos.foldbuilder.FoldBuilder.java
/**
 * Builds {@code k} stratified folds from the lines of a CSV file, with the label column given
 * symbolically.
 *
 * <p>Every line becomes an {@code Instance} with a generated id ({@code "inst0"},
 * {@code "inst1"}, ...). Each fold receives, per label, a share of randomly drawn ids
 * proportional to that label's frequency in the file; an id is assigned to at most one fold.
 * Although a "Writing folds on disk" status message may be printed, this method itself never
 * touches {@code outputDirectory}. An empty input file yields {@code k} empty folds and an empty
 * dictionary.
 *
 * @param inputFile the CSV file the data comes from
 * @param outputDirectory the directory meant to receive the folds (not used here)
 * @param separator the separator of the CSV file, applied through {@code String.split}
 * @param indexLabel the label column: {@code "first"}, {@code "last"} (last column of the first
 *        line) or a decimal column index
 * @param k the number of folds to generate
 * @param speak whether to print status messages along the way
 * @return a pair of the folds (as sets of instance ids) and the dictionary resolving ids to
 *         instances
 * @throws IOException if the input file cannot be read
 * @throws NumberFormatException if {@code indexLabel} is neither {@code "first"}, {@code "last"}
 *         nor a parsable integer
 */
private static Pair<List<Set<String>>, Map<String, Instance>> getFolds(String inputFile, String outputDirectory,
        String separator, String indexLabel, int k, boolean speak) throws IOException {
    Random rng = new Random();
    Map<String, Instance> dictionary = new HashMap<>();
    Map<String, Integer> classes = new HashMap<>();
    Map<String, List<String>> reversedDictionary = new HashMap<>();

    // Read the file once; the same lines serve to resolve the label column and to build instances.
    List<String> lines = Files.readAllLines(new File(inputFile).toPath());

    int labIndex;
    if (indexLabel.equals("first")) {
        labIndex = 0;
    } else if (indexLabel.equals("last")) {
        // With no line there is no column to resolve; the value is never used in that case.
        labIndex = lines.isEmpty() ? 0 : lines.get(0).split(separator).length - 1;
    } else {
        labIndex = Integer.parseInt(indexLabel);
    }

    int id = 0;
    for (String line : lines) {
        Instance inst = new Instance();
        String[] elements = line.split(separator);
        inst.content = line;
        inst.label = elements[labIndex];

        String iid = "inst" + id;
        dictionary.put(iid, inst);
        classes.merge(inst.label, 1, Integer::sum);
        reversedDictionary.computeIfAbsent(inst.label, label -> new ArrayList<>()).add(iid);
        id++;
    }

    int numberOfInstances = id;
    int sizeOfEachFold = numberOfInstances / k;

    // Share of each label among all instances.
    Map<String, Double> classRatios = new HashMap<>();
    for (Map.Entry<String, Integer> classFrequency : classes.entrySet()) {
        classRatios.put(classFrequency.getKey(),
                (double) classFrequency.getValue() / (double) numberOfInstances);
    }

    List<Set<String>> folds = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        Set<String> fold = new HashSet<>();
        for (Map.Entry<String, List<String>> c : reversedDictionary.entrySet()) {
            List<String> remaining = c.getValue();
            int numberRequired = (int) Math.floor(classRatios.get(c.getKey()) * sizeOfEachFold);
            int target = fold.size() + numberRequired;
            // Draw without replacement so an id never lands in two folds.
            while (fold.size() < target && !remaining.isEmpty()) {
                fold.add(remaining.remove(rng.nextInt(remaining.size())));
            }
        }
        folds.add(fold);
        if (speak) {
            System.out.println("Finished computing fold " + (i + 1) + " of size " + fold.size());
        }
    }

    if (speak) {
        System.out.println("Writing folds on disk");
    }
    return Pair.of(folds, dictionary);
}