// Java tutorial
package org.gbif.refine.datasets.taibif; import org.gbif.api.model.checklistbank.NameUsage; import org.gbif.api.model.checklistbank.NameUsageMatch; import org.gbif.api.model.common.LinneanClassification; import org.gbif.api.service.checklistbank.NameUsageMatchingService; import org.gbif.api.vocabulary.Rank; import org.gbif.io.CSVReader; import org.gbif.io.CSVReaderFactory; import org.gbif.refine.client.WebserviceClientModule; import org.gbif.refine.utils.Constants; import org.gbif.refine.utils.FileUtils; import org.gbif.refine.utils.TermUtils; import org.gbif.utils.file.ClosableReportingIterator; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Writer; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.Locale; import java.util.Set; import javax.validation.constraints.NotNull; import com.google.common.collect.Sets; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This class is used to clean, augment, and transform the original Fish Assemblages dataset published in Scientific * Data into a DwC sample event, star-formatted dataset consisting of event records (core records) and their associated * occurrences (extension records). */ public class FishAssemblages { private static final Logger LOG = LoggerFactory.getLogger(FishAssemblages.class); private static final NameUsageMatchingService MATCHING_SERVICE = WebserviceClientModule .webserviceClientReadOnly().getInstance(NameUsageMatchingService.class); public static void main(String[] args) throws IOException { // directory where files should be written to File output = org.gbif.utils.file.FileUtils.createTempDir(); processFish(output); LOG.info("Processing 1987-1990_UTF8.txt complete! event.txt and occurrence.txt written to: " + output.getAbsolutePath()); } /** * Iterates over original source file and does the following: * i) cleans it (e.g. 
maps column header names to DwC term names, converts dates to ISO format, etc) * ii) augments it (e.g. adds new columns for sample size, higher taxonomy, etc) * iii) transforms it into star format (core file events.txt is list of unique sampling events, and extension file * occurrence.txt is a list of all observations derived from all sampling events) * * @param output directory to write files to * * @throws IOException if method fails */ public static void processFish(File output) throws IOException { // load the original source file to process InputStream fis = FishAssemblages.class.getResourceAsStream("/datasets/taibif/1987-1990_UTF8.txt"); // create an iterator on the file CSVReader reader = CSVReaderFactory.build(fis, "UTF-8", "\t", null, 1); // get header row for the new event and occurrence files that this method will output String[] header = getHeader(); // sampling events file Writer writerEvents = FileUtils.startEventsFile(output, header); // observations file Writer writerOccs = FileUtils.startOccurrencesFile(output, header); // to capture all unique eventIDs Set<String> events = Sets.newHashSet(); // to capture bad names Set<String> namesNotFound = Sets.newTreeSet(); ClosableReportingIterator<String[]> iter = null; int line = 0; try { iter = reader.iterator(); while (iter.hasNext()) { line++; String[] record = iter.next(); if (record == null || record.length == 0) { continue; } // create new augmented record String[] modifiedRecord = Arrays.copyOf(record, header.length); // convert year and month into ISO format String year = modifiedRecord[1]; String month = modifiedRecord[2]; if (year.length() == 4 && month.length() == 3) { String concatenatedDate = year + "-" + month; DateFormat concatenatedDf = new SimpleDateFormat("yy-MMM", Locale.ENGLISH); Date concatenatedEventDate = concatenatedDf.parse(concatenatedDate); String concatenatedIsoDate = Constants.ISO_DF_SHORT.format(concatenatedEventDate); // quality control: ensure year and month are same as 
eventDate (if eventDate provided) String verbatimEventDate = modifiedRecord[3]; if (!verbatimEventDate.isEmpty()) { // convert event date (e.g. 1987/03/) into ISO format (e.g. 1987-03) DateFormat df = new SimpleDateFormat("yy/MM/", Locale.ENGLISH); Date eventDate = df.parse(verbatimEventDate); String isoDate = Constants.ISO_DF_SHORT.format(eventDate); if (!isoDate.equals(concatenatedIsoDate)) { LOG.error("Skipping record: year " + year + " & month " + month + " don't match eventDate " + isoDate); continue; } } modifiedRecord[3] = concatenatedIsoDate; } else { LOG.error("Skipping record: invalid year (" + year + ") and month (" + month + ")"); continue; } modifiedRecord[4] = modifiedRecord[4].toUpperCase(); // occurrenceStatus (present vs absent) // TODO: confirm there are absence records! Indeed there are records missing individualCount if (modifiedRecord[10].isEmpty()) { modifiedRecord[16] = Constants.ABSENT; } else { modifiedRecord[16] = TermUtils.getOccurrenceStatus(Integer.valueOf(modifiedRecord[10])) .toString().toLowerCase(); } // add static values modifiedRecord[17] = "Taiwan"; // country modifiedRecord[18] = "TW"; // countryCode // static values, based on which nuclear power plant it is: N1 or N2 if (modifiedRecord[4].equals("N1")) { modifiedRecord[19] = "Nuclear Power Plant at Shihmen"; // locality modifiedRecord[20] = "25 17 9 N, 121 35 10 E"; // verbatimCoordinates modifiedRecord[21] = "25.28583"; // decimalLatitude modifiedRecord[22] = "121.5861"; // decimalLongitude } else { modifiedRecord[19] = "Nuclear Power Plant at Yehliu"; // locality modifiedRecord[20] = "25 12 10 N, 121 39 45 E"; // verbatimCoordinates modifiedRecord[21] = "25.20278"; // decimalLatitude modifiedRecord[22] = "121.6625"; // decimalLongitude } modifiedRecord[23] = "fish samples were collected monthly from the intake screens at nuclear power plant for 24h (from 9 AM to 9 AM) on the date chosen by a systematic sampling method (Cochran, W. G. Sampling Techniques. 3rd ed. 
(John Wiley & Sons, 1977)"; // samplingProtocol modifiedRecord[24] = "24"; // sampleSizeValue modifiedRecord[25] = "hour"; // sampleSizeUnit modifiedRecord[26] = "24hr"; // samplingEffort modifiedRecord[27] = "http://creativecommons.org/publicdomain/zero/1.0/legalcode"; // license modifiedRecord[28] = "Event"; // type modifiedRecord[29] = "Chen H, Liao Y, Chen C, Tsai J, Chen L, Shao K"; // rightsHolder modifiedRecord[31] = "MaterialSample"; // basisOfRecord modifiedRecord[32] = "Dr. Kwang-Tsao Shao and the senior laboratory members"; // identifiedBy modifiedRecord[33] = "Identification done using plenty of handbooks of field guide and identification keys."; // identifiedBy modifiedRecord[35] = "individuals"; // organismQuantityType modifiedRecord[36] = "Animalia"; // kingdom modifiedRecord[37] = "Chordata"; // phylum // store organismQuantity modifiedRecord[34] = modifiedRecord[11]; // same as individualCount // construct unique eventID for this sampling period // Format: "urn:[institutionID]:[eventDate]:[locationID]" // Example: "urn:taibif:1987-08:N2" modifiedRecord[0] = "urn:taibif:" + modifiedRecord[3] + ":" + modifiedRecord[4]; // verify taxonomy String name = modifiedRecord[8].trim(); // for more accurate match, we take higher taxonomy into consideration LinneanClassification cl = new NameUsage(); cl.setFamily(modifiedRecord[6]); // only if binomial, set species if (name.split(" ").length == 2 && !name.endsWith("spp.")) { cl.setSpecies(name); // lowest rank specified Rank rank = TermUtils.lowestRank(cl); if (rank != null) { modifiedRecord[43] = rank.toString().toLowerCase(); } // verify name, and add higher taxonomy NameUsageMatch match = MATCHING_SERVICE.match(name, rank, cl, false, false); if (match.getMatchType().equals(NameUsageMatch.MatchType.EXACT)) { modifiedRecord[36] = match.getKingdom(); modifiedRecord[37] = match.getPhylum(); modifiedRecord[38] = match.getClazz(); modifiedRecord[39] = match.getOrder(); modifiedRecord[40] = match.getFamily(); 
modifiedRecord[41] = match.getGenus(); modifiedRecord[42] = match.getScientificName(); modifiedRecord[43] = match.getRank().toString(); modifiedRecord[44] = match.getUsageKey().toString(); modifiedRecord[45] = match.getStatus().toString(); } else { if (!namesNotFound.contains(name)) { LOG.error(match.getMatchType().toString() + " match for: " + name + " (with rank " + rank + ") to: " + match.getScientificName() + " (with rank " + match.getRank() + ")" + ". See example record with eventDate: " + modifiedRecord[0]); namesNotFound.add(name); } } } else { namesNotFound.add(name); } // construct unique occurrenceID for this abundance record: // Format: "urn:[institutionCode]:[eventDate]:[locationID]:[taxonID]" // Example: "urn:taibif:1994-08:N2:1301" modifiedRecord[30] = modifiedRecord[0] + modifiedRecord[44]; // always output line to new occurrences file String row = FileUtils.tabRow(modifiedRecord); writerOccs.write(row); // only output line to events file if event hasn't been included yet String eventID = modifiedRecord[0]; if (!events.contains(eventID)) { writerEvents.write(row); events.add(eventID); } } LOG.info("Iterated over " + line + " rows."); LOG.info("Found " + events.size() + " unique events."); LOG.warn("***** " + namesNotFound.size() + " names not found in taxa list: "); for (String notFound : namesNotFound) { LOG.warn(notFound); } } catch (Exception e) { // some error validating this file, report LOG.error("Exception caught while iterating over file", e); } finally { if (iter != null) { iter.close(); } reader.close(); writerEvents.close(); writerOccs.close(); } } /** * @return array of column names in output files (event.txt, occurrence.txt) */ @NotNull private static String[] getHeader() { String[] header = new String[46]; // ***original columns // header 0: ID, always empty, so convert to dwc:eventID header[0] = "eventID"; // header 1: year, e.g. 1987 // maps to dwc:year header[1] = "year"; // header 2: month, e.g. 
"Mar" // maps to dwc:month header[2] = "month"; // header 3: (eventDate), e.g. 1987/03/ // converted to ISO format 1994-08 // maps to dwc:eventDate header[3] = "eventDate"; // header 4: Station, e.g. "N2", "N1" // maps to dwc:locationID header[4] = "locationID"; // header 5: Sample, sparsely populated header[5] = "Sample"; // header 6: Family, e.g. "Acanthuridae" // maps to dwc:family header[6] = "family"; // header 7: ?? (Family), e.g. "" header[7] = "family_zh"; // header 8: TL(cm), e.g. "Naso lituratus" // maps to dwc:scientificName header[8] = "scientificName"; // header 9: ?? (vernacularName in ZH_tw), e.g. "" // maps to dwc:vernacularName header[9] = "vernacularName"; // header 10: (individualCount), e.g. 1 // maps to dwc:individualCount header[10] = "individualCount"; // header 11: Weight(Tol), e.g. 23 // TODO: MoF? header[11] = "Weight(Tol)"; // header 12: Weight(Mean), e.g. 66,5 // TODO: MoF? header[12] = "Weight(Mean)"; // header 13: New, always empty header[13] = "New"; // header 14: Note, e.g. "?" // maps to dwc:eventRemarks // TODO: add "The samples collected up to April 1990 were recorded as presence-absence data only. From September 2000 on, the samples were recorded quantitatively, i.e., the number of fish of each species was recorded." header[14] = "eventRemarks"; // header 15: TL(cm), e.g. 15.5~16 // TODO: MoF? 
header[15] = "TL(cm)"; // ***new augmented columns of information // present or absent - depending on individualCount (samples collected up to April 1990 were recorded as presence-absence data only) header[16] = "occurrenceStatus"; // Taiwan header[17] = "country"; // TW header[18] = "countryCode"; // Nuclear Power Plant N1 or N2 header[19] = "locality"; // 1st Nuclear Power Plant at Shihmen (25 17 9 N, 121 35 10 E) [25.28583, 121.5861] // 2nd Nuclear Power Plant at Yehliu (25 12 10 N, 121 39 45 E) [25.20278, 121.6625] header[20] = "verbatimCoordinates"; // 25.28583 or 25.20278 header[21] = "decimalLatitude"; // 121.5861 or 121.6625 header[22] = "decimalLongitude"; // fish samples were collected monthly from the intake screens at nuclear power plant for 24h (from 9 AM to 9 AM) on the date chosen by a systematic sampling method (Cochran, W. G. Sampling Techniques. 3rd ed. (John Wiley & Sons, 1977), except during the maintenance period which is about one month during winter to spring seasons. header[23] = "samplingProtocol"; // time duration in number of collection hours (24) header[24] = "sampleSizeValue"; // hour header[25] = "sampleSizeUnit"; // number of collection hours (24 hours) header[26] = "samplingEffort"; // http://creativecommons.org/publicdomain/zero/1.0/legalcode (since data in Dryad is licensed under CC0 - http://datadryad.org/resource/doi:10.5061/dryad.m777t?show=full) header[27] = "license"; // Event header[28] = "type"; // Chen H, Liao Y, Chen C, Tsai J, Chen L, Shao K header[29] = "rightsHolder"; // unique occurrenceID header[30] = "occurrenceID"; // MaterialSample header[31] = "basisOfRecord"; // Dr. Kwang-Tsao Shao and the senior laboratory members header[32] = "identifiedBy"; // Identification done using plenty of handbooks of field guide and identification keys. 
header[33] = "identificationRemarks"; // copied from individualCount header[34] = "organismQuantity"; // individuals header[35] = "organismQuantityType"; // taxonomy header[36] = "kingdom_gbif"; header[37] = "phylum_gbif"; header[38] = "class_gbif"; header[39] = "order_gbif"; header[40] = "family_gbif"; header[41] = "genus_gbif"; header[42] = "scientificName_gbif"; header[43] = "taxonRank"; header[44] = "taxonID_gbif"; header[45] = "taxonomicStatus_gbif"; return header; } }