Java tutorial: converting Wiktionary data into LexicalEntry objects with JWKTL
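The class below, WiktionaryReader (from the ca.mcgill.cs.crown.data package), uses JWKTL to convert English Wiktionary data into LexicalEntry objects. It accepts three input forms: a raw Wiktionary XML dump (loadFromDump), an already-parsed JWKTL edition directory (loadFromDir), and a preprocessed file with one JSON object per line (loadFromPreprocessed). During extraction, each noun, verb, adjective, or adverb sense is flattened into a JSON object carrying sense, id, lemma, pos, glosses, examples, and relations fields; convertToEntries then turns those JSON objects into LexicalEntry instances annotated with their cleaned glosses and semantic relations.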
/*
 * This source code is subject to the terms of the Creative Commons
 * Attribution-NonCommercial-ShareAlike 4.0 license. If a copy of the BY-NC-SA
 * 4.0 License was not distributed with this file, You can obtain one at
 * https://creativecommons.org/licenses/by-nc-sa/4.0.
 */
package ca.mcgill.cs.crown.data;

import de.tudarmstadt.ukp.jwktl.JWKTL;
import de.tudarmstadt.ukp.jwktl.api.IWikiString;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEdition;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryRelation;
import de.tudarmstadt.ukp.jwktl.api.IWiktionarySense;
import de.tudarmstadt.ukp.jwktl.api.PartOfSpeech;
import de.tudarmstadt.ukp.jwktl.api.RelationType;
import de.tudarmstadt.ukp.jwktl.api.filter.WiktionaryEntryFilter;
import de.tudarmstadt.ukp.jwktl.api.util.Language;

import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import edu.ucla.sspace.util.LineReader;

import edu.stanford.nlp.util.CoreMap;

import ca.mcgill.cs.crown.LexicalEntry;
import ca.mcgill.cs.crown.LexicalEntryImpl;
import ca.mcgill.cs.crown.Relation;
import ca.mcgill.cs.crown.RelationImpl;
import ca.mcgill.cs.crown.CrownAnnotations;
import ca.mcgill.cs.crown.util.CrownLogger;
import ca.mcgill.cs.crown.util.WiktionaryUtils;

import edu.mit.jwi.item.POS;

/**
 * A class for converting Wiktionary data in several forms into a series of
 * {@link LexicalEntry} objects.
 */
public class WiktionaryReader {

    public List<LexicalEntry> loadFromDump(File wiktionaryXmlDump,
                                           File outputWiktionaryDir,
                                           File outputPreprocessedFile)
            throws IOException {

        // Sanity check that we're not needlessly extracting from the dump file
        // by seeing if the directory already contains the extracted data
        if (outputWiktionaryDir.exists()
                && outputWiktionaryDir.listFiles().length > 0) {
            try {
                IWiktionaryEdition wikt =
                    JWKTL.openEdition(outputWiktionaryDir);
                CrownLogger.info("Loading Wiktionary data from " +
                                 "already-parsed result");
                List<JSONObject> rawEntries =
                    extract(wikt, outputPreprocessedFile);
                return convertToEntries(rawEntries);
            } catch (Throwable t) {
                // Ignore because we'll process the dump file to get the data
                // anyway
            }
        }
        JWKTL.parseWiktionaryDump(wiktionaryXmlDump, outputWiktionaryDir, true);
        return loadFromDir(outputWiktionaryDir, outputPreprocessedFile);
    }

    public List<LexicalEntry> loadFromDir(File wiktionaryDir,
                                          File preprocessedOutputFile) {
        IWiktionaryEdition wikt = JWKTL.openEdition(wiktionaryDir);
        List<JSONObject> rawEntries = extract(wikt, preprocessedOutputFile);
        return convertToEntries(rawEntries);
    }

    public List<LexicalEntry> loadFromPreprocessed(File preprocessedFile) {
        List<JSONObject> rawEntries = new ArrayList<JSONObject>(500_000);
        for (String line : new LineReader(preprocessedFile)) {
            try {
                JSONObject rawEntry = new JSONObject(line);
                rawEntries.add(rawEntry);
            } catch (JSONException je) {
                throw new IOError(je);
            }
        }
        return convertToEntries(rawEntries);
    }

    private static List<LexicalEntry> convertToEntries(
            List<JSONObject> rawEntries) {

        // Avoid the potential for duplicates in the entries
        Set<String> alreadyIncluded = new HashSet<String>();
        int excluded = 0;

        List<LexicalEntry> entries = new ArrayList<LexicalEntry>();
        for (JSONObject jo : rawEntries) {
            try {
                String posStr = jo.getString("pos").toUpperCase();
                String lemma = jo.getString("lemma");
                String id = jo.getString("id");

                // Check for duplicates, keying on lemma, POS, and sense id.
                // (Note: the lookup and the add must use the same key, or the
                // duplicate check never fires.)
                String key = lemma + "." + posStr + ":" + id;
                if (alreadyIncluded.contains(key)) {
                    excluded++;
                    continue;
                }
                alreadyIncluded.add(key);

                LexicalEntry e =
                    new LexicalEntryImpl(lemma, id, POS.valueOf(posStr));

                Set<String> glosses = new LinkedHashSet<String>();
                Map<String, String> rawGlossToCleaned =
                    new LinkedHashMap<String, String>();

                JSONArray glossArr = jo.getJSONArray("glosses");
                for (int i = 0; i < glossArr.length(); ++i) {
                    String rawGloss = glossArr.getString(i);
                    String cleaned = WiktionaryUtils.cleanGloss(rawGloss);
                    glosses.add(cleaned);
                    rawGlossToCleaned.put(rawGloss, cleaned);
                }
                String combinedGloss = String.join(" ", glosses);

                List<Relation> relations = new ArrayList<Relation>();
                JSONArray relationsArr = jo.getJSONArray("relations");
                for (int i = 0; i < relationsArr.length(); ++i) {
                    JSONObject relObj = relationsArr.getJSONObject(i);
                    Relation rel = new RelationImpl(
                        relObj.getString("targetLemma"),
                        relObj.optString("targetSense"),
                        Relation.RelationType.valueOf(relObj.getString("type")));
                    relations.add(rel);
                }

                CoreMap m = e.getAnnotations();
                m.set(CrownAnnotations.Gloss.class, combinedGloss);
                m.set(CrownAnnotations.Glosses.class, glosses);
                m.set(CrownAnnotations.RawGlosses.class, rawGlossToCleaned);
                m.set(CrownAnnotations.Relations.class, relations);

                entries.add(e);
            } catch (JSONException je) {
                throw new IOError(je);
            }
        }
        CrownLogger.verbose("Excluded %d duplicate entries", excluded);
        return entries;
    }

    private List<JSONObject> extract(IWiktionaryEdition wikt, File outputFile) {
        try {
            return extract_(wikt, outputFile);
        } catch (Exception e) {
            // Ugh... lazy: wrap any checked exception in an unchecked IOError
            throw new IOError(e);
        }
    }

    private List<JSONObject> extract_(IWiktionaryEdition wikt, File outputFile)
            throws IOException, JSONException {

        List<JSONObject> rawEntries = new ArrayList<JSONObject>(500_000);

        WiktionaryEntryFilter filter = new WiktionaryEntryFilter();
        filter.setAllowedWordLanguages(Language.ENGLISH);

        PrintWriter pw = null;
        if (outputFile != null)
            pw = new PrintWriter(outputFile);

        int i = 0;
        for (IWiktionaryEntry entry : wikt.getAllEntries(filter)) {
            PartOfSpeech pos = entry.getPartOfSpeech();
            POS pos_ = null;
            if (pos == null) {
                if (++i % 10_000 == 0)
                    CrownLogger.info("Processed %d entries", i);
                continue;
            }

            String lemma = entry.getWord();
            char posChar = 'n';
            switch (pos) {
            case NOUN:
            case PROPER_NOUN:
            case MEASURE_WORD:
                posChar = 'n';
                pos_ = POS.NOUN;
                break;
            case VERB:
                posChar = 'v';
                pos_ = POS.VERB;
                break;
            case ADJECTIVE:
                posChar = 'a';
                pos_ = POS.ADJECTIVE;
                break;
            case ADVERB:
                posChar = 'r';
                pos_ = POS.ADVERB;
                break;
            // We don't want to deal with other POS tags
            default:
                // System.out.printf("Skipping %s %s%n", lemma, pos);
                if (++i % 10_000 == 0)
                    CrownLogger.info("Processed %d entries", i);
                continue;
            }

            for (IWiktionarySense sense : entry.getSenses()) {
                IWikiString gloss = sense.getGloss();
                List<IWikiString> examples = sense.getExamples();
                if (examples == null)
                    examples = Collections.<IWikiString>emptyList();
                int senseNum = sense.getIndex();
                List<String> rawGlosses =
                    Arrays.asList(gloss.getText().split("\n"));

                JSONObject rawEntry = new JSONObject();
                rawEntry.put("sense", lemma + "." + posChar + "." + senseNum);
                rawEntry.put("id", lemma + ":" + sense.getId());
                rawEntry.put("lemma", lemma);
                rawEntry.put("pos", pos_.toString());

                JSONArray glossArr = new JSONArray();
                for (String rawGloss : rawGlosses)
                    glossArr.put(rawGloss);
                rawEntry.put("glosses", glossArr);

                JSONArray examplesArr = new JSONArray();
                for (IWikiString example : examples)
                    examplesArr.put(example.getText());
                rawEntry.put("examples", examplesArr);

                JSONArray relArr = new JSONArray();
                List<IWiktionaryRelation> relations = sense.getRelations();
                if (relations == null)
                    relations = Collections.<IWiktionaryRelation>emptyList();
                for (IWiktionaryRelation rel : relations) {
                    JSONObject relObj = new JSONObject();
                    relObj.put("targetLemma", rel.getTarget());
                    relObj.put("targetSense", rel.getTargetSense());
                    relObj.put("type", rel.getRelationType().toString());
                    relArr.put(relObj);
                }
                rawEntry.put("relations", relArr);

                if (pw != null)
                    pw.println(rawEntry.toString());
                rawEntries.add(rawEntry);
                // System.out.printf("%s.%s.%d: %s %s%n",
                //                   lemma, pos, senseNum, rawGlosses, examples);
            }

            if (++i % 10_000 == 0) {
                CrownLogger.info("Processed %d entries", i);
            }
        }
        wikt.close();
        if (pw != null)
            pw.close();
        return rawEntries;
    }
}
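A minimal usage sketch follows. The file paths here are placeholders I've assumed for illustration; they are not part of the original class.

import java.io.File;
import java.util.List;

import ca.mcgill.cs.crown.LexicalEntry;
import ca.mcgill.cs.crown.data.WiktionaryReader;

public class WiktionaryReaderDemo {
    public static void main(String[] args) throws Exception {
        WiktionaryReader reader = new WiktionaryReader();

        // First run: parse the raw XML dump, caching both the parsed JWKTL
        // edition and a one-JSON-object-per-line preprocessed file.
        // (All three paths below are hypothetical.)
        List<LexicalEntry> entries = reader.loadFromDump(
            new File("enwiktionary-pages-articles.xml"),  // raw dump (assumed)
            new File("wiktionary-db"),                    // JWKTL output dir
            new File("wiktionary-preprocessed.jsonl"));   // cached entries

        // Later runs can skip XML parsing entirely and reload the cache:
        // List<LexicalEntry> entries =
        //     reader.loadFromPreprocessed(new File("wiktionary-preprocessed.jsonl"));

        System.out.printf("Loaded %d lexical entries%n", entries.size());
    }
}

Because loadFromDump first checks whether the output directory already holds a parsed edition, running it twice with the same paths reuses the earlier result rather than re-parsing the dump; loadFromPreprocessed is cheaper still, since it only reads the cached JSON lines.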