Java tutorial
package nl.knaw.huygens.timbuctoo.tools.importer.neww; /* * #%L * Timbuctoo tools * ======= * Copyright (C) 2012 - 2015 Huygens ING * ======= * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-3.0.html>. * #L% */ import java.io.File; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import nl.knaw.huygens.timbuctoo.Repository; import nl.knaw.huygens.timbuctoo.config.TypeRegistry; import nl.knaw.huygens.timbuctoo.index.IndexManager; import nl.knaw.huygens.timbuctoo.model.Collective; import nl.knaw.huygens.timbuctoo.model.Document; import nl.knaw.huygens.timbuctoo.model.Document.DocumentType; import nl.knaw.huygens.timbuctoo.model.DomainEntity; import nl.knaw.huygens.timbuctoo.model.Location; import nl.knaw.huygens.timbuctoo.model.Person; import nl.knaw.huygens.timbuctoo.model.Reference; import nl.knaw.huygens.timbuctoo.model.ckcc.CKCCPerson; import nl.knaw.huygens.timbuctoo.model.neww.WWCollective; import nl.knaw.huygens.timbuctoo.model.neww.WWDocument; import nl.knaw.huygens.timbuctoo.model.neww.WWKeyword; import nl.knaw.huygens.timbuctoo.model.neww.WWLanguage; import nl.knaw.huygens.timbuctoo.model.neww.WWLocation; import nl.knaw.huygens.timbuctoo.model.neww.WWPerson; import nl.knaw.huygens.timbuctoo.model.neww.WWRelation; import nl.knaw.huygens.timbuctoo.model.util.Datable; import nl.knaw.huygens.timbuctoo.model.util.Link; import nl.knaw.huygens.timbuctoo.model.util.PersonName; import nl.knaw.huygens.timbuctoo.storage.ValidationException; import nl.knaw.huygens.timbuctoo.tools.config.ToolsInjectionModule; import nl.knaw.huygens.timbuctoo.tools.importer.DefaultImporter; import nl.knaw.huygens.timbuctoo.util.Text; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Stopwatch; import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.google.inject.Injector; /** * Imports data of the "New European Women Writers" project. * * Usage: * java -cp [specs] ${package-name}.WomenWritersImporter [importDirName] */ public class WomenWritersImporter extends DefaultImporter { private static final Logger LOG = LoggerFactory.getLogger(WomenWritersImporter.class); public static void main(String[] args) throws Exception { Stopwatch stopWatch = Stopwatch.createStarted(); // Handle commandline arguments String directory = (args.length > 0) ? args[0] : "../../timbuctoo-testdata/src/main/resources/neww/"; WomenWritersImporter importer = null; try { Injector injector = ToolsInjectionModule.createInjector(); Repository repository = injector.getInstance(Repository.class); IndexManager indexManager = injector.getInstance(IndexManager.class); importer = new WomenWritersImporter(repository, indexManager, directory); importer.importAll(); } finally { if (importer != null) { importer.close(); } LOG.info("Time used: {}", stopWatch); } } // ------------------------------------------------------------------- /** References of stored primitive entities */ private final Map<String, Reference> references = Maps.newHashMap(); /** Keys of invalid primitive entities */ private final Set<String> invalids = Sets.newHashSet(); /** Special keywords */ private KeywordConcordance keywords; /** For deserializing JSON */ private final ObjectMapper objectMapper; private final File inputDir; public WomenWritersImporter(Repository repository, IndexManager indexManager, String inputDirName) { super(repository, indexManager, "neww"); objectMapper = new ObjectMapper(); inputDir = new File(inputDirName); if (inputDir.isDirectory()) { System.out.printf("%n.. Importing from %s%n", inputDir.getAbsolutePath()); } else { System.out.printf("%n.. Not a directory: %s%n", inputDir.getAbsolutePath()); } } public void importAll() throws Exception { printBoxedText("Initialization"); openImportLog("neww-log.txt"); removeNonPersistentEntities(WWCollective.class); removeNonPersistentEntities(WWDocument.class); removeNonPersistentEntities(WWKeyword.class); removeNonPersistentEntities(WWPerson.class); removeNonPersistentEntities(WWRelation.class); printBoxedText("Import"); importRelationTypes(); setupRelationTypeDefs(); System.out.println(".. Keywords"); System.out.printf("Number = %6d%n%n", importKeywords()); keywords = new KeywordConcordance(repository, change); keywords.handleFile(new File(inputDir, "neww-keywords.txt"), 0, false); System.out.println(".. Languages"); System.out.printf("Number = %6d%n%n", importLanguages()); System.out.println(".. Locations"); System.out.printf("Number = %6d%n%n", importLocations()); System.out.println(".. Collectives"); System.out.printf("Number = %6d%n%n", importCollectives()); System.out.println(".. Documents"); System.out.printf("Source documents = %6d%n%n", importSourceDocuments()); System.out.printf("Regular documents = %6d%n%n", importRegularDocuments()); System.out.println(".. Persons"); Set<String> collaboratorIds = collectCollaborators(); System.out.printf("Number = %6d%n%n", importPersons(collaboratorIds)); collaboratorIds.clear(); System.out.printf("nUnknown %5d%n", nUnknown); System.out.printf("nArchetype %5d%n", nArchetype); System.out.printf("nAuthor %5d%n", nAuthor); System.out.printf("nPseudonym %5d%n", nPseudonym); System.out.printf("nDuplicates %5d%n", nDuplicates); System.out.println(".. Relations"); importRelations(); System.out.printf("Number of missing relation types = %6d%n", missingRelationTypes); System.out.printf("Number of unstored relations = %6d%n", unstoredRelations); System.out.printf("Number of duplicate relations = %6d%n", getDuplicateRelationCount()); printBoxedText("Indexing"); indexEntities(WWCollective.class); indexEntities(WWDocument.class); indexEntities(WWKeyword.class); indexEntities(WWLanguage.class); indexEntities(WWPerson.class); indexEntities(WWLocation.class); indexEntities(WWRelation.class); displayStatus(); displayErrorSummary(); closeImportLog(); } // --- Support --------------------------------------------------------------- /** Returns key for a reference map. */ private String newKey(String name, String id) { return name + ":" + id; } private Reference storeReference(String key, Class<? extends DomainEntity> type, String id) { Reference reference = new Reference(TypeRegistry.toBaseDomainEntity(type), id); references.put(key, reference); return reference; } private LineIterator getLineIterator(String filename) throws IOException { File file = new File(inputDir, filename); return FileUtils.lineIterator(file, "UTF-8"); } private String preprocessJson(String line) { line = StringUtils.stripToEmpty(line); if (line.startsWith("{")) { line = line.replaceAll("\"_id\"", "\"tempid\""); line = line.replaceAll("ObjectId\\(\\s*(\\S+)\\s*\\)", "$1"); return line; } else { System.out.println("## Skipping line: " + line); return ""; } } private void verifyEmptyField(String line, String key, String value) { if (!Strings.isNullOrEmpty(value)) { System.out.println("Unexpected value for: " + key); System.out.println(line); } } private String verifyNonEmptyField(String line, String key, String value) { if (Strings.isNullOrEmpty(value)) { handleError("Missing '%s' in: %s", key, line); } return value; } // --- Collectives ----------------------------------------------------------- private int importCollectives() throws Exception { int initialSize = references.size(); LineIterator iterator = getLineIterator("collectives.json"); try { while (iterator.hasNext()) { String line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { handleCollective(line); } } } finally { LineIterator.closeQuietly(iterator); } return references.size() - initialSize; } private void handleCollective(String json) throws Exception { XCollective object = objectMapper.readValue(json, XCollective.class); String key = newKey("Collective", object.tempid); if (references.containsKey(key)) { handleError("Duplicate id %s", key); } else { WWCollective converted = convert(json, object); if (converted == null) { invalids.add(key); } else { String storedId = addDomainEntity(WWCollective.class, converted); storeReference(key, WWCollective.class, storedId); } } } private WWCollective convert(String line, XCollective object) { WWCollective converted = new WWCollective(); String name = verifyNonEmptyField(line, "name", filterField(object.name)); if (name == null) { return null; } converted.setName(name); String type = verifyNonEmptyField(line, "type", filterField(object.type)); converted.tempType = type; if (type == null || type.equals("membership")) { converted.setType(Collective.Type.UNKNOWN); } else { converted.setType(type); } converted.tempLocationPlacename = filterField(object.location_placename); converted.tempOrigin = filterField(object.origin); String shortName = converted.tempShortName = filterField(object.short_name); if (shortName != null && shortName.matches("[A-Z]{2,6}")) { converted.setAcronym(shortName); } String notes = filterField(object.notes); if (notes != null && !notes.equals(shortName)) { converted.setNotes(filterNotesField(object.notes)); } String url = filterField(object.url); if (url != null) { converted.addLink(new Link(url, url)); } return converted; } public static class XCollective { public String tempid; public String email; public String location_id; // needed for relation, ignore public String location_placename; // store temporarily public String name; // 4 entries without a name, but they occur in relations public String notes; // used. how do we handle whitespace? public int old_id; // ignore public String origin; // seems to be country. isn't this implied by location? public String original_field; // ignore public String original_table; // ignore public String short_name; public String telephone; public String type; // 'library', 'membership' public String url; } // --- Documents ------------------------------------------------------------- private static final String JUNK_SOURCE = "Info in Reference and/or Provional notes."; private static final String DOC_SOURCE_TYPE = "docSourceType"; private int importSourceDocuments() throws Exception { int initialSize = references.size(); Reference relationTypeRef = relationTypes.get("hasSourceCategory"); List<String> ignoredCategories = Lists.newArrayList("", "-", "TBD"); LineIterator iterator = getLineIterator("documents.json"); String line = ""; try { while (iterator.hasNext()) { line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { String json = preprocessDocumentJson(line); XDocument object = objectMapper.readValue(json, XDocument.class); if (object != null && object.source != null) { String title = filterField(object.source.full_name); String key = newKey("SourceDocument", title); if (title != null && !title.startsWith(JUNK_SOURCE) && !references.containsKey(key)) { WWDocument document = new WWDocument(); document.setSource(true); document.setTitle(title); document.setNotes(filterNotesField(object.source.notes)); String storedId = addDomainEntity(WWDocument.class, document); Reference documentRef = new Reference(Document.class, storedId); references.put(key, documentRef); String category = StringUtils.trimToEmpty(object.source.type); Reference keywordRef = keywords.lookup(DOC_SOURCE_TYPE, category); if (keywordRef != null) { addRelation(WWRelation.class, relationTypeRef, documentRef, keywordRef, change, ""); } else if (!ignoredCategories.contains(category)) { System.out.printf("Undefined source category [%s] for [%s]%n", category, title); } } } } } } catch (JsonMappingException e) { System.out.println(line); throw e; } finally { LineIterator.closeQuietly(iterator); } return references.size() - initialSize; } private PublisherNormalizer publisherNormalizer; private int importRegularDocuments() throws Exception { int initialSize = references.size(); documentTypeMap = createDocumentTypeMap(); File file = new File(inputDir, "publishers.txt"); publisherNormalizer = new PublisherNormalizer(file); LineIterator iterator = getLineIterator("documents.json"); String line = ""; try { while (iterator.hasNext()) { line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { handleDocument(preprocessDocumentJson(line)); } } } catch (JsonMappingException e) { System.out.println(line); throw e; } finally { LineIterator.closeQuietly(iterator); publisherNormalizer = null; } return references.size() - initialSize; } private Map<String, DocumentType> documentTypeMap; private Map<String, DocumentType> createDocumentTypeMap() { Map<String, DocumentType> map = Maps.newHashMap(); map.put("anthology", DocumentType.ANTHOLOGY); map.put("article", DocumentType.ARTICLE); map.put("award", DocumentType.AWARD); map.put("catalogue", DocumentType.CATALOGUE); map.put("list", DocumentType.LIST); map.put("picture", DocumentType.PICTURE); map.put("publicity", DocumentType.PUBLICITY); map.put("sheet music", DocumentType.SHEETMUSIC); map.put("tbd", DocumentType.UNKNOWN); map.put("theater script", DocumentType.THEATERSCRIPT); map.put("to be done", DocumentType.UNKNOWN); map.put("work", DocumentType.WORK); return map; } private String preprocessDocumentJson(String text) { text = text.replaceAll("\"libraries\" : \"\"", "\"libraries\" : null"); text = text.replaceAll("\"prints\" : \"\"", "\"prints\" : null"); text = text.replaceAll("\"source\" : \"\"", "\"source\" : null"); text = text.replaceAll("\"subject\" : \"\"", "\"subject\" : null"); text = text.replaceAll("\"subject\" : \\[\\]", "\"subject\" : null"); text = text.replaceAll("\"topoi\" : \"\"", "\"topoi\" : null"); text = text.replaceAll("\"topoi\" : \"\"", "\"topoi\" : null"); text = text.replaceAll("\"type\" : \"\"", "\"type\" : \"TBD\""); text = text.replaceAll("\"url\" : \"\", \"url_title\" : \"\"", "\"urls\" : null"); text = text.replaceAll("\"urls\" : \"\"", "\"urls\" : null"); return text; } private void handleDocument(String json) throws Exception { XDocument object = objectMapper.readValue(json, XDocument.class); String key = newKey("Document", object.tempid); if (references.containsKey(key)) { handleError("Duplicate key %s", key); } else { WWDocument converted = convert(json, object); if (converted != null) { String storedId = addDomainEntity(WWDocument.class, converted); Reference reference = storeReference(key, WWDocument.class, storedId); handlePublisher(extractPrints(object), converted.getDate(), reference); if (object.source != null) { String title = filterField(object.source.full_name); Reference sourceDocRef = references.get(newKey("SourceDocument", title)); if (sourceDocRef != null) { Reference relationTypeRef = getRelationTypeRef("hasDocumentSource", true); addRelation(WWRelation.class, relationTypeRef, reference, sourceDocRef, change, ""); } } } } } private WWDocument convert(String line, XDocument object) { WWDocument converted = new WWDocument(); converted.setResourceType(Document.ResourceType.TEXT); String type = filterField(object.type); converted.setDocumentType(toDocumentType(type)); String title = filterField(object.title); converted.setTitle(title != null ? title : "[Untitled]"); converted.setDescription(filterField(object.description)); String date = filterField(object.date); if (date != null) { try { converted.setDate(new Datable(date)); } catch (RuntimeException e) { handleError("Illegal 'date' in %s", line); } } converted.tempOrigin = filterField(object.origin); converted.setReference(filterField(object.reference)); // the keywords are not normalized: identical topoi occur as different items if (object.topoi != null && object.topoi.length != 0) { for (String[] topos : object.topoi) { if (topos[0] != null) { converted.addTopos(topos[0]); } } } StringBuilder notesBuilder = new StringBuilder(); Text.appendTo(notesBuilder, filterNotesField(object.notes), ""); List<XPrint> prints = extractPrints(object); for (XPrint print : prints) { Text.appendTo(notesBuilder, "* Print", NEWLINE); Text.appendTo(notesBuilder, print.edition, NEWLINE + "Edition: "); Text.appendTo(notesBuilder, print.publisher, NEWLINE + "Publisher: "); Text.appendTo(notesBuilder, print.location, NEWLINE + "Location: "); Text.appendTo(notesBuilder, print.year, NEWLINE + "Year: "); } if (selectFirstEdition(prints, date) != null) { converted.setEdition("1"); } converted.setNotes(notesBuilder.toString()); if (object.urls != null) { for (Map.Entry<String, String> entry : object.urls.entrySet()) { String label = filterField(entry.getKey()); String url = filterField(entry.getValue()); converted.addLink(new Link(url, label)); } } converted.tempCreator = filterField(object.creator); converted.tempLanguage = filterField(object.language); if (object.old_id != 0) { converted.tempOldId = String.format("%s/%d", object.original_table, object.old_id); } return converted.isValid() ? converted : null; } public DocumentType toDocumentType(String type) { DocumentType documentType = (type != null) ? documentTypeMap.get(type.toLowerCase()) : null; return (documentType != null) ? documentType : DocumentType.UNKNOWN; } private List<XPrint> extractPrints(XDocument object) { List<XPrint> prints = Lists.newArrayList(); if (object.prints != null && object.prints.size() != 0) { // order by key Map<String, XPrint> temp = Maps.newTreeMap(); temp.putAll(object.prints); for (Map.Entry<String, XPrint> entry : temp.entrySet()) { XPrint filtered = new XPrint(); filtered.edition = filterField(entry.getValue().edition); filtered.publisher = filterField(entry.getValue().publisher); filtered.location = filterField(entry.getValue().location); filtered.year = filterField(entry.getValue().year); prints.add(filtered); } } return prints; } private static final String IS_PUBLISHED_BY = "isPublishedBy"; private void handlePublisher(List<XPrint> prints, Datable datable, Reference documentRef) throws ValidationException { String date = (datable != null) ? datable.toString() : null; XPrint first = selectFirstEdition(prints, date); if (first != null) { String name = first.publisher; if (name != null && !name.isEmpty()) { name = publisherNormalizer.normalize(name); if (name != null && !name.isEmpty()) { String key = newKey("Publisher", name); Reference publisherRef = references.get(key); if (publisherRef == null) { WWCollective collective = new WWCollective(); collective.setType(Collective.Type.PUBLISHER); collective.setName(name); collective.tempLocationPlacename = first.location; String storedId = addDomainEntity(WWCollective.class, collective); publisherRef = storeReference(key, WWCollective.class, storedId); } Reference relationRef = relationTypes.get(IS_PUBLISHED_BY); addRelation(WWRelation.class, relationRef, documentRef, publisherRef, change, ""); } } } } private XPrint selectFirstEdition(List<XPrint> prints, String year) { XPrint selected = null; if (prints != null && !prints.isEmpty() && year != null) { int best = 0; for (XPrint print : prints) { int score = (edition(print) == 1) ? 1 : 0; if (year.equals(print.year)) { score++; } if (score == best) { // not unique selected = null; } else if (score > best) { selected = print; best = score; } } } return selected; } private int edition(XPrint print) { int n = 0; String text = print.edition; if (text != null) { for (char ch : text.toCharArray()) { if (Character.isDigit(ch)) { n = 10 * n + (ch - '0'); } else break; } } return n; } public static class XDocument { public String tempid; public String creator; // ignore public String creator_id; // object id of creator - must occur in relations public String date; // convert to datable public String description; // text public String language; // ignore public String language_id; // object id of language - must occur in relations public String[] libraries; // list of library id's - must occur in relations public String notes; // text public int old_id; // record number in NEWW database public String origin; // item of a list of countries. BUT origin of what? public String original_table; // table in NEWW database public Map<String, XPrint> prints; // printed editions public String reference; // text, sparse, unstructured public XSource source; public String[][] subject; // a list of subject keywords (text, id) - must occur in relations public String title; // text public String[][] topoi; // a list of topoi keywords (text, id) - must occur in relations public String type; // 'Article', 'Catalogue', 'List', 'Picture', 'Publicity', 'TBD', 'Work' public Map<String, String> urls; // convert to Link public String url; public String url_title; // convert to Link } public static class XSource { public String notes; public String type; public String full_name; public String short_name; @Override public String toString() { return String.format("%s | %s | %s | %s%n", notes, type, full_name, short_name); } } public static class XPrint { public String edition; public String publisher; public String location; public String year; } // --- Keywords -------------------------------------------------------------- // Used for handling multiple occurrences of key values private final Map<String, String> keywordValueIdMap = Maps.newHashMap(); private final Set<String> toposIds = Sets.newHashSet(); private int importKeywords() throws Exception { int initialSize = references.size(); LineIterator iterator = getLineIterator("keywords.json"); try { while (iterator.hasNext()) { String line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { handleKeyword(line); } } } finally { LineIterator.closeQuietly(iterator); } return references.size() - initialSize; } private void handleKeyword(String json) throws Exception { XKeyword object = objectMapper.readValue(json, XKeyword.class); String key = newKey("Keyword", object.tempid); if (references.containsKey(key)) { handleError("Duplicate key %s", key); } else { WWKeyword converted = convert(json, object); if (converted != null) { String value = converted.getValue(); String storedId = keywordValueIdMap.get(value); if (storedId == null) { storedId = addDomainEntity(WWKeyword.class, converted); keywordValueIdMap.put(value, storedId); } storeReference(key, WWKeyword.class, storedId); } } } private WWKeyword convert(String line, XKeyword object) { WWKeyword converted = new WWKeyword(); converted.setType(filterField(object.type)); verifyNonEmptyField(line, "type", converted.getType()); converted.setValue(filterField(object.keyword)); verifyNonEmptyField(line, "keyword", converted.getValue()); // We only store keywords of type "genre" // Relations with keywords are of type "hasGenre" if ("topos".equals(converted.getType())) { invalids.add(newKey("Keyword", object.tempid)); toposIds.add(object.tempid); return null; } else if (!"genre".equals(converted.getType())) { handleError("Unexpected type", converted.getType()); converted.setValue(null); } return converted.isValid() ? converted : null; } public static class XKeyword { public String tempid; public String keyword; public int old_id; // ignore public String original_table; // ignore public String type; } // --- Languages ------------------------------------------------------------- // The imported data contains ISO639-2 codes and names that are not mapped. // The simplest strategy is to implement our own mapping from name to code. /** * Creates a mapping from language names to ISO639-3 language codes. */ private Map<String, String> createNameCodeMap() { Map<String, String> map = Maps.newHashMap(); map.put("Albanian", "sqi"); map.put("Arabic", "ara"); map.put("Armenian", "hye"); map.put("Atticism", "ell"); map.put("Basque", "eus"); map.put("Breton", "bre"); map.put("Bulgarian", "bul"); map.put("Catalan", "cat"); map.put("Chinese", "zho"); map.put("Croatian", "hrv"); map.put("Czech", "ces"); map.put("Danish", "dan"); map.put("Dutch", "nld"); map.put("English", "eng"); map.put("Esperanto", "epo"); map.put("Estonian", "est"); map.put("Finnish", "fin"); map.put("French", "fra"); map.put("Frisian", "fry"); map.put("Galician", "glg"); map.put("German", "deu"); map.put("Greek", "ell"); map.put("Hebrew", "hbo"); map.put("Hungarian", "hun"); map.put("Icelandic", "isl"); map.put("Irish Gaelic", "gle"); map.put("Italian", "ita"); map.put("Japanese", "jpn"); map.put("Latin", "lat"); map.put("Lithuanian", "lit"); map.put("Norwegian", "nno"); map.put("Occitan", "oci"); map.put("Ottoman Turkish", "ota"); map.put("Persian", "fas"); map.put("Polish", "pol"); map.put("Portuguese", "por"); map.put("Romanian", "ron"); map.put("Russian", "rus"); map.put("Serbian", "srp"); map.put("Slavo-Serbian", "srp"); map.put("Slovak", "slk"); map.put("Slovenian", "slv"); map.put("Sorbian language", "srp"); map.put("Spanish", "spa"); map.put("Swedish", "swe"); map.put("Turkish", "tur"); map.put("Ukrainian", "ukr"); map.put("Uzbek", "uzb"); return map; } private String mapName(Map<String, String> map, String name) { if (Strings.isNullOrEmpty(name)) { return "?"; } else { String code = map.get(name); return (code != null) ? code : "?"; } } private int importLanguages() throws Exception { int initialSize = references.size(); Map<String, String> map = createNameCodeMap(); LineIterator iterator = getLineIterator("languages.json"); String line = ""; try { System.out.printf("\"language\",\"ISO-code\",\"ISO-name\"%n"); while (iterator.hasNext()) { line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { XLanguage object = objectMapper.readValue(line, XLanguage.class); String name = verifyNonEmptyField(line, "name", filterField(object.name)); if (name == null) { invalids.add(newKey("Language", object.tempid)); } else { String code = mapName(map, name); // Get WWLanguage instance with values of primitive entity Language WWLanguage language = repository.getLanguageByCode(WWLanguage.class, code); if (language == null) { verifyNonEmptyField(line, "name", null); } else { String flag = name.equals(language.getName()) ? "" : " *"; System.out.printf("%-30s%-8s%-30s%s%n", name, language.getCode(), language.getName(), flag); language.setCore(true); // TODO prevent multiple updates for same language updateProjectDomainEntity(WWLanguage.class, language); String key = newKey("Language", object.tempid); storeReference(key, WWLanguage.class, language.getId()); } } } } } catch (JsonMappingException e) { System.out.println(line); throw e; } finally { LineIterator.closeQuietly(iterator); } return references.size() - initialSize; } public static class XLanguage { public String db_name; public String tempid; public String ISO639_1Code; public String ISO639_2Code; public String name; public int old_id; // ignore public String original_table; // ignore } // --- Locations ------------------------------------------------------------- private final Map<String, String> conc = Maps.newHashMap(); private void setupConc() { conc.put("alton, hampshire#england", "se:alton.eng"); conc.put("amsterdam#netherlands", "se:amsterdam.nld"); conc.put("amsterdam#null", "se:amsterdam.nld"); conc.put("andros#greece", "re:andros.grc"); conc.put("athens#greece", "se:athens.grc"); conc.put("barcelona#spain", "se:barcelona.esp"); conc.put("belgrade#null", "se:belgrade.srb"); conc.put("belgrade#serbia", "se:belgrade.srb"); conc.put("bergen#norway", "se:bergen.nor"); conc.put("berlin#germany", "se:berlin.deu"); conc.put("bialystok#poland", "se:bialystok.pol"); conc.put("bologna#italy", "se:bologna.ita"); conc.put("bruxelles#belgium/southern netherlands", "se:brussel.bel"); conc.put("bucharest#romania", "se:bucharest.rou"); conc.put("budapest#hungary", "se:budapest.hun"); conc.put("cambridge#england", "se:cambridge.eng"); conc.put("cesena#italy", "se:cesena.ita"); conc.put("copenhagen#denmark", "se:kobenhavn.dnk"); conc.put("den haag#netherlands", "se:den-haag.nld"); conc.put("deventer#netherlands", "se:deventer.nld"); conc.put("dublin#ireland", "se:dublin.irl"); conc.put("dublin#null", "se:dublin.irl"); conc.put("firenze#italy", "se:firenze.ita"); conc.put("gothenburg#sweden", "se:goteborg.swe"); conc.put("groningen#null", "se:groningen.nld"); conc.put("helsinki#finland", "se:helsinki.fin"); conc.put("ioannina#greece", "se:ioannina.grc"); conc.put("kampen#netherlands", "se:kampen.nld"); conc.put("kopenhagen#denmark", "se:kobenhavn.dnk"); conc.put("koper#slovenia", "se:koper.svn"); conc.put("krakow#poland", "se:krakow.pol"); conc.put("leeuwarden#netherlands", "se:leeuwarden.nld"); conc.put("leiden#netherlands", "se:leiden.nld"); conc.put("leiden#null", "se:leiden.nld"); conc.put("leipzig and frankfurt am main#germany", "IGNORE"); conc.put("lisboa#portugal", "se:lisbon.prt"); conc.put("lisbon#portugal", "se:lisbon.prt"); conc.put("ljubljana#slovenia", "se:ljubljana.svn"); conc.put("londen#england", "se:london.eng"); conc.put("london#england", "se:london.eng"); conc.put("lublin#poland", "se:lublin.pol"); conc.put("madrid#spain", "se:madrid.esp"); conc.put("middelburg#netherlands", "se:middelburg.nld"); conc.put("milan#italy", "se:milano.ita"); conc.put("milano#italy", "se:milano.ita"); conc.put("milano#italy", "se:milano.ita"); conc.put("milano#null", "se:milano.ita"); conc.put("mlndal#sweden", "se:molndal.swe"); conc.put("moscow#russia", "se:moscow.rus"); conc.put("mnchen#germany", "se:munchen.deu"); conc.put("napoli#italy", "se:napoli.ita"); conc.put("nicosia#null", "se:nicosia.cyp"); conc.put("nijmegen#netherlands", "se:nijmegen.nld"); conc.put("northfield, minnesota#united states", "se:northfield.usa"); conc.put("novi sad#serbia", "se:novi-sad.srb"); conc.put("null#albania", "co:alb"); conc.put("null#argentina", "co:arg"); conc.put("null#australia", "co:aus"); conc.put("null#austria", "co:aut"); conc.put("null#austro-hungarian empire", "bl:emp-austro-hungarian"); conc.put("null#bohemia", "re:bohemia.cze"); conc.put("null#belgium/southern netherlands", "co:bel"); conc.put("null#bosnia and herzegovina", "co:bih"); conc.put("null#brasil", "co:bra"); conc.put("null#bulgaria", "co:bgr"); conc.put("null#byzantine empire", "bl:emp-byzantine"); conc.put("null#canada", "co:can"); conc.put("null#castilia", "re:castilia.esp"); conc.put("null#china", "co:chn"); conc.put("null#colombia", "co:col"); conc.put("null#colonies of european countries", "IGNORE"); conc.put("null#croatia", "co:hrv"); conc.put("null#czech republic", "co:cze"); conc.put("null#denmark", "co:dnk"); conc.put("null#egypt", "co:egy"); conc.put("null#england", "co:eng"); conc.put("null#england", "co:eng"); conc.put("null#estonia", "co:est"); conc.put("null#finland", "co:fin"); conc.put("null#france", "co:fra"); conc.put("null#germany", "co:deu"); conc.put("null#greece", "co:grc"); conc.put("null#hungary", "co:hun"); conc.put("null#iceland", "co:isl"); conc.put("null#india", "co:ind"); conc.put("null#ireland", "co:irl"); conc.put("null#italy", "co:ita"); conc.put("null#italy", "re:emilia-romagna.ita"); conc.put("null#japan", "co:jpn"); conc.put("null#kingdom of sardinia - piedmont", "IGNORE"); conc.put("null#latvia", "co:lva"); conc.put("null#lithuania", "co:ltu"); conc.put("null#luxemburg", "co:lux"); conc.put("null#mexico", "co:mex"); conc.put("null#montenegro", "co:mne"); conc.put("null#netherlands", "co:nld"); conc.put("null#norway", "co:nor"); conc.put("null#null", "IGNORE"); conc.put("null#ottoman empire", "bl:emp-ottoman"); conc.put("null#poland", "co:pol"); conc.put("null#portugal", "co:prt"); conc.put("null#romania", "co:rou"); conc.put("null#russia", "co:rus"); conc.put("null#scotland", "co:sco"); conc.put("null#serbia", "co:srb"); conc.put("null#slovakia", "co:svk"); conc.put("null#slovenia", "co:svn"); conc.put("null#south-america (to be specified)", "bl:south-america"); conc.put("null#spain", "co:esp"); conc.put("null#sweden", "co:swe"); conc.put("null#switzerland", "co:che"); conc.put("null#turkey", "co:tur"); conc.put("null#ukraine", "co:ukr"); conc.put("null#united states", "co:usa"); conc.put("null#unknown / not relevant", "IGNORE"); conc.put("null#uzbekistan", "co:uzb"); conc.put("null#wales", "re:wales.gbr"); conc.put("oslo#norway", "se:oslo.nor"); conc.put("oviedo#spain", "se:oviedo.esp"); conc.put("oxford#england", "se:oxford.eng"); conc.put("parijs#france", "se:paris.fra"); conc.put("parijs#null", "se:paris.fra"); conc.put("paris#france", "se:paris.fra"); conc.put("patra#greece", "se:patras.grc"); conc.put("patras#greece", "se:patras.grc"); conc.put("petersburg#russia", "se:saint-petersburg.rus"); conc.put("porto#portugal", "se:porto.prt"); conc.put("pozna, poznan#poland", "se:poznan.pol"); conc.put("pozna#poland", "se:poznan.pol"); conc.put("praha 1#czech republic", "se:prague.cze"); conc.put("rethymno#greece", "se:rethymno.grc"); conc.put("rotterdam#null", "se:rotterdam.nld"); conc.put("san francisco#united states", "se:san-francisco.usa"); conc.put("sofia#bulgaria", "se:sofia.bgr"); conc.put("st. petersburg#russia", "se:saint-petersburg.rus"); conc.put("st.petersburg#russia", "se:saint-petersburg.rus"); conc.put("the hague#netherlands", "se:den-haag.nld"); conc.put("thessaloniki#greece", "se:thessaloniki.grc"); conc.put("tilburg#netherlands", "se:tilburg.nld"); conc.put("toledo#spain", "se:toledo.esp"); conc.put("torino#italy", "se:torino.ita"); conc.put("turku#finland", "se:turku.fin"); conc.put("university of helsinki#finland", "se:helsinki.fin"); conc.put("utrecht#null", "se:utrecht.nld"); conc.put("vatican city#italy", "co:vat"); conc.put("volda#norway", "se:volda.nor"); conc.put("warsaw#poland", "se:warszawa.pol"); conc.put("warszawa#poland", "se:warszawa.pol"); conc.put("washington, dc#united states", "se:washington.usa"); conc.put("xxx#brasil", "co:bra"); conc.put("warszawa#poland", "se:warszawa.pol"); } private int importLocations() throws Exception { setupConc(); int initialSize = references.size(); LineIterator iterator = getLineIterator("locations.json"); try { while (iterator.hasNext()) { String line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { handleLocation(preprocessLocation(line)); } } } finally { conc.clear(); LineIterator.closeQuietly(iterator); } return references.size() - initialSize; } private String preprocessLocation(String text) { return text; } private void handleLocation(String json) throws Exception { XLocation object = objectMapper.readValue(json, XLocation.class); String key = newKey("Location", object.tempid); if (references.containsKey(key)) { handleError("Duplicate key %s", key); return; } WWLocation converted = convert(json, object); if (converted == null) { invalids.add(key); return; } String code = String.format("%s#%s", converted.tempSettlement, converted.tempCountry); String urn = conc.get(code.toLowerCase()); if (urn == null) { System.out.println(".. Adding new location: " + code); String storedId = addDomainEntity(WWLocation.class, converted); storeReference(key, WWLocation.class, storedId); return; } if (urn.equals("IGNORE")) { System.out.println(".. Ignoring location: " + code); invalids.add(key); return; } Location location = repository.findEntity(Location.class, Location.URN, urn); if (location == null) { handleError("URN not found: %s", urn); return; } converted.setId(location.getId()); converted.setRev(location.getRev()); converted.setUrn(location.getUrn()); converted.setDefLang(location.getDefLang()); converted.setNames(location.getNames()); converted.setLatitude(location.getLatitude()); converted.setLongitude(location.getLongitude()); updateProjectDomainEntity(WWLocation.class, converted); storeReference(key, WWLocation.class, location.getId()); } private WWLocation convert(String line, XLocation object) { WWLocation converted = new WWLocation(); verifyEmptyField(line, "bloc", object.bloc); verifyEmptyField(line, "district", object.district); verifyEmptyField(line, "geogName", object.geogName); verifyEmptyField(line, "houseNumber", object.houseNumber); verifyEmptyField(line, "latitude", object.latitude); verifyEmptyField(line, "longitude", object.longitude); verifyEmptyField(line, "notes", object.notes); verifyEmptyField(line, "period", object.period); verifyEmptyField(line, "region", object.region); converted.tempAddress = filterField(object.address); converted.tempSettlement = filterField(object.settlement); converted.tempCountry = filterField(object.country); converted.tempZipcode = filterField(object.zipcode); if (converted.isValid()) { return converted; } else { handleError("Empty location: ", line); return null; } } private static class XLocation { public String tempid; public String address; public String bloc; // EMPTY public String country; public String district; // EMPTY public String geogName; // contains settlement public String houseNumber; // EMPTY public String latitude; // EMPTY public String longitude; // EMPTY public String notes; // EMPTY public String period; // EMPTY public String region; // EMPTY public String settlement; // EMPTY public String zipcode; // 54 values } // --- Persons --------------------------------------------------------------- private Map<String, String> ckccMap; // Concordance with CKCC persons public Map<String, String> ckccConcordance() { Map<String, String> map = Maps.newHashMap(); map.put("authors/354", "roemers-visscher.anna.1583-1651"); map.put("authors/353", "roemers-visscher.maria-tesselschade.1594-1649"); map.put("authors/486", "schurman.anna-maria.1607-1678"); map.put("authors/1990", "reigersberch.suzanna.1586-1640"); map.put("authors/2199", "reigersberch.maria.1589-1653"); return map; } private final Pattern simpleNamePattern = Pattern.compile("^(\\p{Lu}\\p{L}+), (\\p{Lu}\\p{L}+)$"); private final Set<String> excludedNames = Sets.newHashSet("Comtesse", "Madame", "Madamoiselle", "Mejuffrouw", "Mevrouw", "Mme", "Mrs", "Queen", "Vrou"); private final Set<String> ignoredValues = Sets.newHashSet("not relevant", "not yet checked", "not yet checke", "not yet known", "seems impossible to know", "to be specified", "unknown", "unkown"); // maps line without id to stored id private final Map<String, String> lines = Maps.newHashMap(); private int nUnknown = 0; private int nArchetype = 0; private int nAuthor = 0; private int nPseudonym = 0; private int nDuplicates = 0; private int importPersons(Set<String> collaboratorIds) throws Exception { int initialSize = references.size(); ckccMap = ckccConcordance(); LineIterator iterator = getLineIterator("persons.json"); String line = ""; try { while (iterator.hasNext()) { line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { handlePerson(preprocessPerson(line), collaboratorIds); } } } catch (JsonMappingException e) { System.out.println(line); throw e; } finally { LineIterator.closeQuietly(iterator); } for (String type : types) { System.out.printf("type %s%n", type); } lines.clear(); return references.size() - initialSize; } private String preprocessPerson(String text) { text = text.replaceAll("\"financials\" : \\[\\]", "\"health\" : null"); text = text.replaceAll("\"health\" : \\[\\]", "\"health\" : null"); text = text.replaceAll("\"languages\" : \"\"", "\"languages\" : null"); text = text.replaceAll("\"url\" : \"\"", "\"url\" : null"); return text; } private void handlePerson(String json, Set<String> collaboratorIds) throws Exception { XPerson object = objectMapper.readValue(json, XPerson.class); if (collaboratorIds.contains(object.tempid)) { log("Ignoring generated collaborator %s: %s%n", object.tempid, object.name); return; } String key = newKey("Person", object.tempid); if (references.containsKey(key)) { handleError("Duplicate key %s", key); } else { WWPerson converted = convert(json, object); if (converted == null) { handleError("Ignoring invalid record: %s", json); } else { String line = json.replaceFirst("\"tempid\"\\s*:\\s*\"[^\"]*\",", ""); String storedId = lines.get(line); if (storedId != null) { nDuplicates++; } else { storedId = storePerson(converted); lines.put(line, storedId); } Reference personRef = storeReference(key, WWPerson.class, storedId); handleXRelation(object.old_id, personRef, "hasEducation", "education", object.education); handleXRelation(object.old_id, personRef, "hasFinancialSituation", "financialSituation", object.financials); handleXRelation(object.old_id, personRef, "hasMaritalStatus", "maritalStatus", object.marital_status); handleXRelation(object.old_id, personRef, "hasProfession", "profession", object.professions); handleXRelation(object.old_id, personRef, "hasReligion", "religion", object.religion); handleXRelation(object.old_id, personRef, "hasSocialClass", "socialClass", object.social_class); } } } // For a limited number of cases we store the person as a variatiant of an existing CKCC person private String storePerson(WWPerson wwPerson) { String urn = wwPerson.tempOldId != null ? ckccMap.get(wwPerson.tempOldId) : null; if (urn != null) { CKCCPerson ckccPerson = repository.findEntity(CKCCPerson.class, "urn", urn); if (ckccPerson != null) { log("Updating %s with %s%n", urn, wwPerson.getTempName()); String storedId = ckccPerson.getId(); wwPerson.setId(storedId); wwPerson.setRev(ckccPerson.getRev()); updateProjectDomainEntity(WWPerson.class, wwPerson); return storedId; } } return addDomainEntity(WWPerson.class, wwPerson); } private void handleXRelation(int oldId, Reference baseRef, String relationName, String keywordType, String... values) { if (values == null) return; Set<String> items = Sets.newHashSet(); for (String value : values) { String text = filterField(value); if (text == null) continue; text = text.toLowerCase().replaceAll("[*\\.\\?,;]", "").trim(); if (text.startsWith("married to")) { items.add("married"); } else if (text.startsWith("translator from")) { items.add("translator"); } else if (text.startsWith("writer of")) { items.add("writer"); } else if (text.endsWith(" writer")) { items.add("writer"); } else if (text.equals("lady") || text.equals("in") || text.equals("waiting")) { items.add("lady-in-waiting"); } else if (text.equals("social") || text.equals("cultural activist")) { items.add("social-cultural activist"); } else if (text.length() > 0) { items.add(text); } } Reference relationTypeRef = relationTypes.get(relationName); for (String item : items) { Reference keywordRef = keywords.lookup(keywordType, item); if (keywordRef != null) { addRelation(WWRelation.class, relationTypeRef, baseRef, keywordRef, change, ""); } else if (!ignoredValues.contains(item)) { log("[%05d] Undefined %s [%s]%n", oldId, keywordType, item); } } } private WWPerson convert(String line, XPerson object) { String text; WWPerson converted = new WWPerson(); converted.setBibliography(filterField(object.bibliography)); text = filterField(object.born_in); if (text != null) { converted.tempBirthPlace = text; } converted.tempChildren = filterField(object.children); converted.tempCollaborations = concatenate(object.collaborations); text = filterField(object.dateOfBirth); if (text != null) { converted.setBirthDate(new Datable(text)); } text = filterField(object.dateOfDeath); if (text != null) { converted.setDeathDate(new Datable(text)); } converted.tempDeath = filterField(object.death); converted.tempFinancialSituation = filterField(object.financial_situation); verifyEmptyField(line, "financialSituation", object.financialSituation); if (object.fs_pseudonyms != null) { for (String item : object.fs_pseudonyms) { converted.addFsPseudonym(filterField(item)); } } text = filterField(object.gender); if (text == null) { converted.setGender(Person.Gender.UNKNOWN); } else if (text.equals("M")) { converted.setGender(Person.Gender.MALE); } else if (text.equals("F")) { converted.setGender(Person.Gender.FEMALE); } else if (text.equals("U")) { converted.setGender(Person.Gender.UNKNOWN); } else { handleError("Unknown gender: %s", text); converted.setGender(Person.Gender.UNKNOWN); } converted.setHealth(filterField(object.health)); // converted.tempLanguages = concatenate(object.languages); converted.setLivedIn(filterField(object.lived_in)); converted.tempMemberships = concatenate(object.memberships); converted.tempMotherTongue = filterField(object.mother_tongue); String name = filterField(object.name); converted.setTempName(name); if (name != null) { Matcher matcher = simpleNamePattern.matcher(name); if (matcher.matches()) { String surname = matcher.group(1); String forename = matcher.group(2); if (!excludedNames.contains(forename)) { converted.addName(PersonName.newInstance(forename, surname)); } } } converted.setNationality(filterField(object.nationality)); converted.setNotes(filterNotesField(object.notes)); converted.setPersonalSituation(filterField(object.personal_situation)); verifyEmptyField(line, "personalSituation", object.personalSituation); converted.tempPlaceOfBirth = concatenate(object.placeOfBirth); converted.tempDeathPlace = filterField(object.placeOfDeath); converted.tempPseudonyms = concatenate(object.pseudonyms); converted.tempPsChildren = concatenate(object.ps_children); // converted.tempPublishingLanguages = concatenate(object.publishing_languages); converted.setTempSpouse(filterField(object.spouse)); String type = filterField(object.type); if (converted.getTempName() != null && converted.getTempName().startsWith("~")) { converted.addType(Person.Type.ARCHETYPE); nArchetype++; } else if (type == null || type.equalsIgnoreCase("unknown")) { nUnknown++; } else if (type.equalsIgnoreCase("author")) { converted.addType(Person.Type.AUTHOR); nAuthor++; } else if (type.equalsIgnoreCase("pseudonym")) { converted.addType(Person.Type.PSEUDONYM); nPseudonym++; } else { handleError("Illegal type '%s'%n", type); } if (object.url != null) { for (Map.Entry<String, String> entry : object.url.entrySet()) { String label = filterField(entry.getKey()); String url = filterField(entry.getValue()); converted.addLink(new Link(url, label)); } } if (object.old_id != 0) { if ("pseudonym".equalsIgnoreCase(type)) { // Generated pseudonyms have the same id as the person they are derived from. // Make sure external datasets link to the persons, not to the pseudonyms. converted.tempOldId = String.format("%s/%d/pseudonym", object.original_table, object.old_id); } else { converted.tempOldId = String.format("%s/%d", object.original_table, object.old_id); } } return converted; } private String concatenate(String[] items) { StringBuilder builder = new StringBuilder(); if (items != null) { for (String item : items) { Text.appendTo(builder, filterField(item), "; "); } } return (builder.length() > 0) ? builder.toString() : null; } private final Set<String> types = Sets.newTreeSet(); protected static class XPerson { public String tempid; public String bibliography; // text public String born_in; // must be mapped to birthPlace public String children; // number of children public String[] collaborations; // seem to be references to persons public String dateOfBirth; // birth year public String dateOfDeath; // death year public String death; // unstructured public String[] education; // unstructured public String financial_situation; // INCORRECT public String financialSituation; // EMPTY public String[] financials; // sparse, unstructured public String[] fs_pseudonyms; // sparse, unstructured public String gender; // U, M, F public String health; // sparse, unstructured public String[] languages; //clarfy: spoken, published, ... public String lived_in; // sparse, unstructured public String marital_status; // sparse, unstructured public String[] memberships; // sparse, unstructured public String mother_tongue; // unstructured, how does this relate to languages? public String name; // unstructured public String nationality; // sparse, how does this relate to pace of birth? public String notes; // text public int old_id; // record number in NEWW database public String original_field; // ignore public String original_table; // table in NEWW database public String personal_situation; // unstructured public String personalSituation; // EMPTY public String[] placeOfBirth; // how can this be an array? public String placeOfDeath; // unstructured public String[] professions; public String[] ps_children; public String[] pseudonyms; public String[] publishing_languages; public String[] religion; public String[] social_class; public String spouse; // as relation public String spouse_id; // ignore public String type; // 'author', 'pseudonym', 'unknown' public Map<String, String> url; } // --- Relations ------------------------------------------------------------- // We won't be using the collaborators generated in the first processing step. // It turns out that the quality is bad. We end up with more work to correct // them than it wil be to enter them properly "by hand". private Set<String> collectCollaborators() throws Exception { Set<String> ids = Sets.newHashSet(); LineIterator iterator = getLineIterator("relations.json"); String line = ""; try { while (iterator.hasNext()) { line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { XRelation object = objectMapper.readValue(line, XRelation.class); String relationType = filterField(object.relation_type); if ("collaborated_with".equals(relationType)) { String leftObject = verifyNonEmptyField(line, "leftObject", filterField(object.leftObject)); String leftId = verifyNonEmptyField(line, "leftId", filterField(object.leftId)); if ("Person".equals(leftObject)) { ids.add(leftId); } } } } } catch (Exception e) { throw e; } finally { LineIterator.closeQuietly(iterator); } return ids; } // Maps from names used in json to registered relation type names private RelationTypeConcordance relationTypeConcordance; private RelationTypeConcordance getRelationTypeConcordance() throws Exception { File file = new File(inputDir, "relationtypes.txt"); return new RelationTypeConcordance(file); } public Set<String> names = Sets.newTreeSet(); private int missingRelationTypes = 0; private int unstoredRelations = 0; private Reference getReference(String type, String id) { if (type == null || type.isEmpty()) { return null; } else { String key = newKey(type, id); return references.get(key); } } private void importRelations() throws Exception { relationTypeConcordance = getRelationTypeConcordance(); LineIterator iterator = getLineIterator("relations.json"); String line = ""; try { while (iterator.hasNext()) { line = preprocessJson(iterator.nextLine()); if (!line.isEmpty()) { handleRelation(line); } } } catch (Exception e) { System.out.println(line); throw e; } finally { LineIterator.closeQuietly(iterator); relationTypeConcordance = null; } } private void handleRelation(String line) throws Exception { XRelation object = objectMapper.readValue(line, XRelation.class); // First verify that essential fields are non-empty String relationType = filterField(object.relation_type); if (relationType == null) { if (++missingRelationTypes <= 5) { verifyNonEmptyField(line, "relation_type", relationType); } return; } // Ignore collaborations; see method collectCollaborators if ("collaborated_with".equals(relationType)) { return; } String leftObject = verifyNonEmptyField(line, "leftObject", filterField(object.leftObject)); String leftId = verifyNonEmptyField(line, "leftId", filterField(object.leftId)); String rightObject = verifyNonEmptyField(line, "rightObject", filterField(object.rightObject)); String rightId = verifyNonEmptyField(line, "rightId", filterField(object.rightId)); if (leftObject == null || leftId == null || rightObject == null || rightId == null) { return; } // Keep track of types seen names.add(relationType + ":" + leftObject + "-->" + rightObject); // Map to registered relation type and validate RelationTypeConcordance.Mapping mapping = relationTypeConcordance.lookup(relationType, leftObject, rightObject); if (mapping == null) { handleError("No relation type for (%s, %s, %s)", relationType, leftObject, rightObject); return; } if (mapping.newName.equalsIgnoreCase("ILLEGAL")) { // skip this one, append to log file return; } if (mapping.inverse) { String tempObject = leftObject; leftObject = rightObject; rightObject = tempObject; String tempId = leftId; leftId = rightId; rightId = tempId; } // Now we should be able to retrieve the reference to the relation type Reference relationRef = relationTypes.get(mapping.newName); if (relationRef == null) { handleError("No relation type for '%s' (derived from '%s')", mapping.newName, mapping.oldName); return; } // Obtain references to source and target entities Reference sourceRef = getReference(leftObject, leftId); if (sourceRef == null) { if (!invalids.contains(newKey(leftObject, leftId))) { handleError("No source reference for '%s-%s'", leftObject, leftId); } return; } Reference targetRef = getReference(rightObject, rightId); if (targetRef == null) { if (!invalids.contains(newKey(rightObject, rightId))) { handleError("No target reference for '%s-%s'", rightObject, rightId); } return; } // Some more validation, to check Json lives up to our expectations verifyEmptyField(line, "canonizing", filterField(object.canonizing)); verifyEmptyField(line, "certainty", filterField(object.certainty)); //verifyEmptyField( line, "child_female", filterTextField(object.child_female)); //verifyEmptyField( line, "child_male", filterTextField(object.child_male)); verifyEmptyField(line, "notes", filterField(object.notes)); //verifyEmptyField( line, "parent_female", filterTextField(object.parent_female)); //verifyEmptyField( line, "parent_male", filterTextField(object.parent_male)); verifyEmptyField(line, "qualification", filterField(object.qualification)); // Finally we're ready to store String storedId = null; try { storedId = addRelation(WWRelation.class, relationRef, sourceRef, targetRef, change, line); } catch (Exception e) { LOG.error("Failed to store: {}", line); System.exit(-1); } if (storedId == null) { if (++unstoredRelations <= 5) { handleError("Not stored.. %s", line); } return; } // Once the relation is in place, we update the project specific attributes // WWRelation relation = storageManager.getEntity(WWRelation.class, storedId); // relation.setQualification(Qualification.UNKNOWN); // updateDomainEntity(WWRelation.class, relation); } protected static class XRelation { public String tempid; public String canonizing; // EMPTY public String certainty; // EMPTY public String child_female; public String child_male; public boolean isReception; public String leftId; public String leftName; public String leftObject; public String notes; // EMPTY public int old_id; public String original_table; public String parent_female; public String parent_male; public String qualification; // EMPTY public String reception_relation_type; // very sparse public String relation_type; // text public String rightId; public String rightName; public String rightObject; } }