Java tutorial
/******************************************************************************* * Copyright (c) 2013, 2014 Matthew Purver, Queen Mary University of London. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Public License v3.0 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/gpl.html * * Contributors: * Matthew Purver, Queen Mary University of London - initial API and implementation ******************************************************************************/ package qmul.align; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import org.apache.poi.ss.usermodel.*; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.jfree.chart.ChartFactory; import org.jfree.chart.ChartPanel; import org.jfree.chart.JFreeChart; import org.jfree.chart.axis.AxisLocation; import org.jfree.chart.axis.NumberAxis; import org.jfree.chart.plot.CombinedDomainXYPlot; import org.jfree.chart.plot.PlotOrientation; import org.jfree.chart.plot.XYPlot; import org.jfree.chart.renderer.xy.StandardXYItemRenderer; import org.jfree.data.xy.XYSeries; import org.jfree.data.xy.XYSeriesCollection; import org.jfree.ui.ApplicationFrame; import org.jfree.ui.RefineryUtilities; import csli.util.dsp.Smoother; import csli.util.dsp.SmoothingFactory; import qmul.corpus.BNCCorpus; import qmul.corpus.CombinedCorpus; import qmul.corpus.DCPSECorpus; import qmul.corpus.Dialogue; import qmul.corpus.DialogueCorpus; import qmul.corpus.DialogueSentence; import qmul.corpus.DialogueSpeaker; import qmul.corpus.DialogueTurn; import qmul.corpus.DialogueUnit; import qmul.corpus.RandomCorpus; import qmul.corpus.SwitchboardCorpus; 
import qmul.util.ApacheStatistics; import qmul.util.MapUtil; import qmul.util.MapUtil.DescendingComparator; import qmul.util.MathUtil; import qmul.util.parse.CreateTreeFromDCPSE; import qmul.util.parse.CreateTreeFromSWBD; import qmul.util.similarity.SimilarityMeasure; import qmul.util.treekernel.TreeKernel; import qmul.window.DialogueWindower; import qmul.window.OtherSpeakerAllOtherSentenceWindower; import qmul.window.OtherSpeakerAllOtherTurnWindower; import qmul.window.OtherSpeakerSentenceWindower; import qmul.window.OtherSpeakerTurnWindower; import qmul.window.SameSpeakerAllOtherSentenceWindower; import qmul.window.SameSpeakerAllOtherTurnWindower; import qmul.window.SameSpeakerSentenceWindower; import qmul.window.SameSpeakerTurnWindower; import qmul.window.SentenceWindower; import qmul.window.TurnWindower; /** * A general class for similarity testing over a {@link DialogueCorpus} * * @author mpurver */ public class AlignmentTester<X extends DialogueUnit> { private static int numTestsRun = 0; // Appended to summary.xlsx to indicate test number. e.g: summary 1.xlsx private DialogueCorpus corpus; private SimilarityMeasure<X> sim; private DialogueWindower<X> win; private OutputStream xls = null; private boolean counts = false; private static final int NORM_NONE = 0; private static final int NORM_MEAN = 1; private static final int NORM_MAX = 2; private int normalisation = NORM_NONE; private Smoother smoother = SmoothingFactory.getSmoother("null"); /** * Default constructor - doesn't write XLS output to file */ public AlignmentTester() { super(); } /** * @param xlsFile * a file to write XLS output to */ public AlignmentTester(File xlsFile) { this(); try { xls = new FileOutputStream(xlsFile); } catch (IOException e) { e.printStackTrace(); } } /** * Process a single dialogue * * @param d * the dialogue to process * @param wb * the XLS workbook to write to, or null not to bother * @return a list of {@link Double} scores, one per {@link DialogueWindower} step (e.g. 
dialogue turn) */ public List<Double> processDialogue(Dialogue d, Workbook wb, HashMap<String, ArrayList<Double>> speakerScores, HashMap<String, String> originalSpks, HashMap<String, ArrayList<Double>> speakerN, MetricsMap spkMetrics, MetricsMap totMetrics, Workbook wbcounts, HashMap<String, HashMap<Object, Integer>> allCounts, HashMap<String, HashMap<Object, Integer>> commonCounts, HashMap<Object, Integer> diaAllCounts, HashMap<Object, Integer> diaCommonCounts) { CreationHelper creationHelper = wb.getCreationHelper(); win.setDialogue(d); sim.reset(); ArrayList<DialogueSpeaker> spks = new ArrayList<DialogueSpeaker>(d.getSpeakers()); Collections.sort(spks); Sheet sheet = (wb == null ? null : wb.createSheet(d.getId().replaceAll(":", "-"))); Sheet sheetcounts = (wbcounts == null ? null : wbcounts.createSheet(d.getId().replaceAll(":", "-"))); int iRow = 0; if (sheet != null) { iRow = writeSheetHeader(creationHelper, sheet, iRow, d, spks); } int iCRow = 0; if (sheetcounts != null) { iCRow = writeSheetHeader(creationHelper, sheet, iCRow, d, spks); } ArrayList<Double> scores = new ArrayList<Double>(); HashSet<X> counted = new HashSet<X>(); do { List<X> left = win.getLeftWindow(); Collections.reverse(left); // windowers return things in dialogue order: we'll look progressively backwards List<X> right = win.getRightWindow(); // System.out.println("lengthS " + left.size() + " " + right.size()); double score = 0.0; double n = 0.0; for (X r : right) { String spkKey = makeSpkKey(r.getSpeaker(), d); String originalSpkKey = ""; if (r.getOriginalSpeaker() != null) { originalSpkKey = r.getOriginalSpeaker().getId(); // fix for the fact that BNC speakers are not currently given SUBdialogue ID in their ID - TODO // change that? 
Dialogue od = r.getOriginalDialogue(); if ((od == null) && (r instanceof DialogueSentence)) { od = ((DialogueSentence) r).getTurn().getOriginalDialogue(); } String originalDia; if (od != null) { originalDia = od.getId(); } else { originalDia = d.getId().replaceFirst("-\\d+$", ""); } if (!originalSpkKey.contains(originalDia)) { if (!originalDia.contains(":")) { throw new RuntimeException("can't find super-dialogue, no : in " + originalDia); } String originalSuperDia = originalDia.substring(0, originalDia.lastIndexOf(":")); if (originalSpkKey.contains(originalSuperDia)) { originalSpkKey = originalSpkKey.replace(originalSuperDia, originalDia); } else { throw new RuntimeException("spk key without super-dialogue " + spkKey + ", " + originalSpkKey + ", " + originalDia); } } } Row row = (wb == null ? null : sheet.createRow(iRow++)); int iCol = 0; Cell cell = (wb == null ? null : row.createCell(iCol++, Cell.CELL_TYPE_STRING)); if (cell != null) { cell.setCellValue(creationHelper.createRichTextString(r.getSpeaker().getId())); cell = row.createCell(iCol++, Cell.CELL_TYPE_STRING); cell.setCellValue(creationHelper.createRichTextString(originalSpkKey)); // cell = row.createCell(iCol++, Cell.CELL_TYPE_STRING); // cell.setCellValue(creationHelper.createRichTextString(r.getId())); cell = row.createCell(iCol++, Cell.CELL_TYPE_STRING); cell.setCellValue(creationHelper.createRichTextString(r.toString())); row.setHeightInPoints(12); sheet.setColumnWidth(iCol - 1, 2560); } if (!speakerScores.containsKey(spkKey)) { speakerScores.put(spkKey, new ArrayList<Double>()); speakerN.put(spkKey, new ArrayList<Double>()); originalSpks.put(spkKey, originalSpkKey); for (int i = 0; i < win.getLeftWindowSize(); i++) { speakerScores.get(spkKey).add(0.0); speakerN.get(spkKey).add(0.0); } Boolean isTurns = null; if (left.size() > 0) { isTurns = (left.get(0) instanceof DialogueTurn); } else if (right.size() > 0) { isTurns = (right.get(0) instanceof DialogueTurn); } spkMetrics.setNumUnits(spkKey, 0); 
totMetrics.setNumUnits(d.getId(), (isTurns ? d.numTurns() : d.numSents())); spkMetrics.setNumWords(spkKey, 0); totMetrics.setNumWords(d.getId(), d.numWords()); spkMetrics.setNumTokens(spkKey, 0); totMetrics.setNumTokens(d.getId(), d.numTokens()); } int iLeft = 0; double offset = Double.NaN; boolean gotOffset = false; for (X l : left) { double s = sim.similarity(l, r); // System.out.println("Siml = " + s + " for l:" + l.getId() + " r:" + r.getId()); if ((l.getOriginalId() != null) && (r.getOriginalId() != null) && l.getOriginalId().equals(r.getOriginalId())) { System.out.println("Equal IDs sim = " + s + " for l:" + l.getId() + " " + l.getOriginalId() + " r:" + r.getId() + " " + r.getOriginalId() + " d " + d.getId() + " nturns " + d.numTurns()); } if (wbcounts != null) { if (!counted.contains(l)) { MapUtil.addAll(diaAllCounts, sim.rawCountsA()); MapUtil.addAll(allCounts.get(""), sim.rawCountsA()); MapUtil.addAll(allCounts.get(d.getGenre()), sim.rawCountsA()); counted.add(l); } if (!counted.contains(r)) { MapUtil.addAll(diaAllCounts, sim.rawCountsB()); MapUtil.addAll(allCounts.get(""), sim.rawCountsB()); MapUtil.addAll(allCounts.get(d.getGenre()), sim.rawCountsB()); counted.add(r); } MapUtil.addAll(diaCommonCounts, sim.rawCountsAB()); MapUtil.addAll(commonCounts.get(""), sim.rawCountsAB()); MapUtil.addAll(commonCounts.get(d.getGenre()), sim.rawCountsAB()); } cell = (wb == null ? 
null : row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC)); if (cell != null) { cell.setCellValue(s); } score += s; n++; speakerScores.get(spkKey).set(iLeft, speakerScores.get(spkKey).get(iLeft) + s); speakerN.get(spkKey).set(iLeft, speakerN.get(spkKey).get(iLeft) + 1); if (!win.getClass().toString().contains("AllOther")) { // for "all other" windowers, actually // average over "window" iLeft++; } if (!gotOffset) { offset = r.getStartTime() - l.getEndTime(); gotOffset = true; // if (!Double.isNaN(offset)) { // System.out.println("Offset = " + offset + " for l:" + l.getId() + " r:" + r.getId()); // } } } // print number sents/words/tokens iCol += (win.getLeftWindowSize() - left.size() + 1); if (wb != null) { // if we are writing to a workbook cell = (wb == null ? null : row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC)); cell.setCellValue(r instanceof DialogueTurn ? ((DialogueTurn) r).getSents().size() : 1); cell = (wb == null ? null : row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC)); cell.setCellValue(r.numWords()); cell = (wb == null ? null : row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC)); cell.setCellValue(r.numTokens()); } iCol += 1; if (!Double.isNaN(offset)) { cell = (wb == null ? null : row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC)); cell.setCellValue(offset); } else { iCol++; } double wordRate = (double) (r.getEndTime() - r.getStartTime()) / (double) r.numWords(); if (r.numWords() == 0) { wordRate = Double.NaN; // on some OSs this doesn't happen in the calc above } if (!Double.isNaN(wordRate)) { cell = (wb == null ? 
null : row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC)); cell.setCellValue(wordRate); } else { iCol++; } // make sure we counted this one - the first one can get missed if leftWindow empty if ((wbcounts != null) && !counted.contains(r)) { sim.similarity(r, r); MapUtil.addAll(diaAllCounts, sim.rawCountsA()); MapUtil.addAll(allCounts.get(""), sim.rawCountsA()); MapUtil.addAll(allCounts.get(d.getGenre()), sim.rawCountsA()); counted.add(r); } spkMetrics.setNumUnits(spkKey, spkMetrics.getNumUnits(spkKey) + 1); spkMetrics.setNumWords(spkKey, spkMetrics.getNumWords(spkKey) + r.numWords()); spkMetrics.setNumTokens(spkKey, spkMetrics.getNumTokens(spkKey) + r.numTokens()); if (!Double.isNaN(offset)) { spkMetrics.setTurnOffset(spkKey, spkMetrics.getTurnOffset(spkKey) + offset); spkMetrics.setNumTurnOffsets(spkKey, spkMetrics.getNumTurnOffsets(spkKey) + 1); totMetrics.setTurnOffset(d.getId(), totMetrics.getTurnOffset(d.getId()) + offset); totMetrics.setNumTurnOffsets(d.getId(), totMetrics.getNumTurnOffsets(d.getId()) + 1); } if (!Double.isNaN(wordRate)) { spkMetrics.setWordRate(spkKey, spkMetrics.getWordRate(spkKey) + wordRate); spkMetrics.setNumWordRates(spkKey, spkMetrics.getNumWordRates(spkKey) + 1); totMetrics.setWordRate(d.getId(), totMetrics.getWordRate(d.getId()) + wordRate); totMetrics.setNumWordRates(d.getId(), totMetrics.getNumWordRates(d.getId()) + 1); } } scores.add((n == 0.0) ? 
0.0 : (score / n)); } while (win.advance()); if (wb != null) { iRow++; for (DialogueSpeaker spk : spks) { String spkKey = makeSpkKey(spk, d); Row row = sheet.createRow(iRow++); int iCol = 0; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(spk.getId())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(originalSpks.get(spkKey))); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Mean")); for (int i = 0; i < win.getLeftWindowSize(); i++) { if (speakerN.get(spkKey).get(i) > 0) { row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC) .setCellValue(speakerScores.get(spkKey).get(i) / speakerN.get(spkKey).get(i)); } else { iCol++; } // System.out // .println("score " + i + " for speaker " + spkKey + "=" + speakerScores.get(spkKey).get(i)); // System.out.println("N " + i + " for speaker " + spkKey + "=" + speakerN.get(spkKey).get(i)); // System.out.println("mean " + i + " for speaker " + spkKey + "=" // + (speakerScores.get(spkKey).get(i) / speakerN.get(spkKey).get(i))); } iCol++; row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( (double) spkMetrics.getNumUnits(spkKey) / (double) spkMetrics.getNumUnits(spkKey)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( (double) spkMetrics.getNumWords(spkKey) / (double) spkMetrics.getNumUnits(spkKey)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( (double) spkMetrics.getNumTokens(spkKey) / (double) spkMetrics.getNumUnits(spkKey)); iCol++; row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( (double) spkMetrics.getTurnOffset(spkKey) / (double) spkMetrics.getNumTurnOffsets(spkKey)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( (double) spkMetrics.getWordRate(spkKey) / (double) spkMetrics.getNumWordRates(spkKey)); } } if (wbcounts != null) { iCRow++; ArrayList<Object> keys = new ArrayList<Object>(diaAllCounts.keySet()); Collections.sort(keys, 
new DescendingComparator<Object>(diaAllCounts)); for (Object key : keys) { Row row = sheetcounts.createRow(iCRow++); int iCol = 0; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(key.toString())); Cell cell = row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC); if (diaAllCounts.get(key) != null) { cell.setCellValue(diaAllCounts.get(key)); } cell = row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC); if (diaCommonCounts.get(key) != null) { cell.setCellValue(diaCommonCounts.get(key)); } } } return scores; } private int writeSheetHeader(CreationHelper creationHelper, Sheet sheet, int iRow, Dialogue d, List<DialogueSpeaker> spks) { int iCol = 0; Row row = sheet.createRow(iRow++); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("ID")); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString(d.getId())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num speakers")); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(d.numSpeakers()); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num turns")); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(d.numTurns()); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num sents")); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(d.numSents()); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num words")); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(d.numWords()); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num tok words")); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(d.numTokens()); iRow++; row = sheet.createRow(iRow++); iCol = 0; row.createCell(iCol++, 
Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Speaker")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Orig Speaker")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("First Name")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Last Name")); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Gender")); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Age")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Occupation")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Dialogue genre")); for (DialogueSpeaker s : spks) { // HACK to find first original speaker for speaker - will only work with consistent speaker pairing DialogueSpeaker os = null; for (DialogueTurn t : d.getTurns()) { if (t.getSpeaker().equals(s)) { os = t.getOriginalSpeaker(); break; } } row = sheet.createRow(iRow++); iCol = 0; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(s.getId())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(os == null ? "" : os.getId())); s = (os == null ? 
s : os); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(s.getFirstName())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(s.getLastName())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(s.getGender())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(s.getAge())); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(s.getOccupation())); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue( creationHelper.createRichTextString(corpus.getGenreMap().get(s.getId().split(":")[0]))); } iRow++; row = sheet.createRow(iRow++); iCol = 0; row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Speaker")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Orig Speaker")); // row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Turn ID")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Transcription")); for (int i = 0; i < getWin().getLeftWindowSize(); i++) { row.createCell(iCol + i, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Val i-" + (i + 1))); } iCol += getWin().getLeftWindowSize() + 1; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num sents")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num words")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Num tok words")); iCol += 1; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Offset time")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) 
.setCellValue(creationHelper.createRichTextString("Time per word")); return iRow; } /** * we actually want spkKey to represent the (sub-)dialogue/speaker pairing. In some corpora (e.g. DCPSE) speaker ID * is unique per (sub-)dialogue anyway, as we know so little about real-world speaker ID. In some (e.g. BNC), * speaker ID may not be unique per (sub-)dialogue, so make it that way * * @param spk * @param d * @return */ private String makeSpkKey(DialogueSpeaker spk, Dialogue d) { String spkKey = spk.getId(); String dId = d.getId(); if (!spkKey.contains(dId)) { String mainId = ""; if (dId.matches("^.*-\\d+$")) { mainId = dId.substring(0, dId.lastIndexOf("-")); if (!spkKey.startsWith(mainId)) { mainId = ""; } } if (mainId.isEmpty() && dId.contains(":")) { // avoid redundant info mainId = dId.substring(0, dId.indexOf(":")); } if (!mainId.isEmpty() && spkKey.contains(mainId)) { spkKey = spkKey.replaceFirst(mainId, dId); } else { spkKey = dId + ":" + spkKey; } } return spkKey; } /** * Process all dialogues in the corpus * * @return a list of lists of scores (one list per dialogue) */ public List<List<Double>> processCorpus(String runId) { ApacheStatistics stats = new ApacheStatistics(); Workbook wb = (xls == null ? null : new XSSFWorkbook()); Workbook wbcounts = (xls == null ? null : (counts ? 
new XSSFWorkbook() : null));
        ArrayList<List<Double>> scores = new ArrayList<List<Double>>();
        System.out.println("Similarity measure " + sim.getClass().getName() + ", windower " + win);
        System.out.println("Smoothing " + smoother + ", normalisation=" + normalisation);
        System.out.println(
                "Processing corpus " + corpus.getId() + " with " + corpus.numDialogues() + " dialogues ...");
        // accumulators shared by all dialogues
        HashMap<String, String> originalSpks = new HashMap<String, String>();
        MetricsMap spkMetrics = new MetricsMap();
        MetricsMap totMetrics = new MetricsMap();
        HashMap<String, ArrayList<Double>> speakerScores = new HashMap<String, ArrayList<Double>>();
        HashMap<String, ArrayList<Double>> speakerN = new HashMap<String, ArrayList<Double>>();
        // maps from genres to maps from objects to integers; the "" entry covers the whole corpus
        HashMap<String, HashMap<Object, Integer>> allCounts = new HashMap<String, HashMap<Object, Integer>>();
        HashMap<String, HashMap<Object, Integer>> commonCounts = new HashMap<String, HashMap<Object, Integer>>();
        allCounts.put("", new HashMap<Object, Integer>());
        commonCounts.put("", new HashMap<Object, Integer>());
        for (String genre : corpus.getGenreCounts().keySet()) {
            if (corpus.getGenreCounts().get(genre) > 0) {
                allCounts.put(genre, new HashMap<Object, Integer>());
                commonCounts.put(genre, new HashMap<Object, Integer>());
            }
        }
        ArrayList<Double> means = new ArrayList<Double>();
        for (Dialogue d : corpus.getDialogues()) {
            HashMap<Object, Integer> diaAllCounts = new HashMap<Object, Integer>();
            HashMap<Object, Integer> diaCommonCounts = new HashMap<Object, Integer>();
            List<Double> subScores = processDialogue(d, wb, speakerScores, originalSpks, speakerN, spkMetrics,
                    totMetrics, wbcounts, allCounts, commonCounts, diaAllCounts, diaCommonCounts);
            System.out.println("Got " + subScores.size() + " scores for dialogue " + d.getId() + ": " + subScores);
            scores.add(subScores);
            // get stats
            Double mean = MathUtil.mean(subScores);
            means.add(mean);
            System.out.println("Mean for dialogue " + d.getId() + ": " + mean);
            ApacheStatistics subStats = new ApacheStatistics(subScores);
            System.out.println("Mean, SD for dialogue " + d.getId() + " = " + subStats.getMean() + " "
                    + subStats.getStandardDeviation());
            stats.addValues(subScores);
        }
        if (wb != null) {
            // 1. the per-run workbook(s)
            printSummarySheet(wb, null, speakerScores, originalSpks, speakerN, spkMetrics, totMetrics,
                    corpus instanceof CombinedCorpus);
            if (counts) {
                printSummaryCountSheet(wbcounts, null, allCounts, commonCounts);
            }
            try {
                wb.write(xls);
                if (counts) {
                    writeWorkbook(wbcounts, new File("counts-" + runId + ".xlsx"));
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1); // non-zero: the run failed (this used to exit with status 0)
            }
            // 2. the cumulative workbooks, which gain (or replace) one sheet named after this run
            File summaryXls = new File(numTestsRun + " summary.xlsx");
            File countsXls = new File("counts.xlsx");
            Workbook summaryWb = null;
            Workbook countsWb = null;
            try {
                // each file is loaded independently, so one missing file no longer discards the other's sheets
                summaryWb = loadOrCreateWorkbook(summaryXls);
                if (counts) {
                    countsWb = loadOrCreateWorkbook(countsXls);
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
            printSummarySheet(summaryWb, runId, speakerScores, originalSpks, speakerN, spkMetrics, totMetrics,
                    corpus instanceof CombinedCorpus);
            if (counts) {
                printSummaryCountSheet(countsWb, runId, allCounts, commonCounts);
            }
            try {
                writeWorkbook(summaryWb, summaryXls);
                if (counts) {
                    writeWorkbook(countsWb, countsXls);
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        }
        System.out.println("Mean over all dialogues: " + MathUtil.mean(means));
        System.out.println("Mean, SD over all dialogues: " + stats.getMean() + " " + stats.getStandardDeviation());
        ApacheStatistics meanStats = new ApacheStatistics(means);
        System.out.println("Mean, SD over all dialogue means: " + meanStats.getMean() + " "
                + meanStats.getStandardDeviation());
        return scores;
    }

    /**
     * Read a workbook from a file, closing the file afterwards.
     *
     * @param file
     *            the workbook file
     * @return the workbook read from the file, or a new empty workbook if the file cannot be opened for reading
     *         (e.g. because it does not exist yet)
     * @throws IOException
     *             if the file can be opened but not read as a workbook
     */
    private Workbook loadOrCreateWorkbook(File file) throws IOException {
        FileInputStream in;
        try {
            in = new FileInputStream(file);
        } catch (FileNotFoundException e) {
            return new XSSFWorkbook();
        }
        try {
            return new XSSFWorkbook(in);
        } finally {
            in.close();
        }
    }

    /**
     * Write a workbook to a file, closing the file afterwards.
     *
     * @param workbook
     *            the workbook to save
     * @param file
     *            the file to (over)write
     * @throws IOException
     *             if the file cannot be opened or written
     */
    private void writeWorkbook(Workbook workbook, File file) throws IOException {
        OutputStream out = new FileOutputStream(file);
        try {
            workbook.write(out);
        } finally {
            out.close();
        }
    }

    /**
     * Abbreviate a sheet name: a few known long words are replaced, then dash-separated segments are cut down to
     * two characters, left to right, until the name fits. If it is still too long once nothing is left to
     * abbreviate, it is truncated.
     *
     * @param orig
     *            the desired sheet name
     * @return a version which is less than 32 chars long, to keep the {@link Workbook} class restrictions happy
     */
    String shorten(String orig) {
        String shorter = orig.replace("stanford", "stn").replace("random", "rd").replace("nointj", "noi");
        while (shorter.length() > 31) {
            String next = shorter.replaceFirst("-(\\w)\\w(\\w)\\w*", "-$1$2");
            if (next.equals(shorter)) {
                // nothing left to abbreviate: cut hard rather than loop forever
                shorter = shorter.substring(0, 31);
                break;
            }
            shorter = next;
        }
        if (!orig.equals(shorter)) {
            System.out.println("Shortened XLS sheet name " + orig + " -> " + shorter);
        }
        return shorter;
    }

    /**
     * Print a summary sheet on the (gulp) excel spreadsheet. A sheet of the same name that already exists in the
     * workbook is replaced, and the new sheet is moved to the front.
     *
     * @param wb
     *            the workbook to add the sheet to
     * @param sheetName
     *            name for the sheet (shortened if necessary), or null for "Summary"
     * @param speakerScores
     *            per-speaker sums of similarity scores, one per left-window position
     * @param originalSpks
     *            map from speaker key to original speaker key
     * @param speakerN
     *            per-speaker sample counts matching speakerScores
     * @param spkMetrics
     *            metrics keyed by speaker
     * @param totMetrics
     *            metrics keyed by dialogue
     * @param pairedCorpus
     *            if true, only speakers whose key matches their original speaker key are listed
     */
    private void printSummarySheet(Workbook wb, String sheetName, HashMap<String, ArrayList<Double>> speakerScores,
            HashMap<String, String> originalSpks, HashMap<String, ArrayList<Double>> speakerN, MetricsMap spkMetrics,
            MetricsMap totMetrics, boolean pairedCorpus) {
        CreationHelper creationHelper = wb.getCreationHelper();
        sheetName = (sheetName == null ? "Summary" : shorten(sheetName));
        System.out.println("Checking workbook " + wb + " for sheet " + sheetName);
        Sheet sheet = wb.getSheet(sheetName);
        if (sheet != null) {
            System.out.println("Exists, removing sheet " + sheetName);
            wb.removeSheetAt(wb.getSheetIndex(sheet));
        }
        sheet = wb.createSheet(sheetName);
        wb.setSheetOrder(sheetName, 0);
        int iRow = 0;
        // first general identifying stuff
        Row row = sheet.createRow(iRow++);
        row.createCell(0, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Corpus"));
        row.createCell(1, Cell.CELL_TYPE_STRING)
                .setCellValue(creationHelper.createRichTextString(getCorpus().getId()));
        row = sheet.createRow(iRow++);
        row.createCell(0, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Windower"));
        row.createCell(1, Cell.CELL_TYPE_STRING)
                .setCellValue(creationHelper.createRichTextString(getWin().toString()));
        row = sheet.createRow(iRow++);
        row.createCell(0, Cell.CELL_TYPE_STRING)
                .setCellValue(creationHelper.createRichTextString("Similarity Measure"));
        row.createCell(1, Cell.CELL_TYPE_STRING)
                .setCellValue(creationHelper.createRichTextString(getSim().toString()));
        // now header (after one blank row)
        row = sheet.createRow(iRow++);
        row = sheet.createRow(iRow++);
        int iCol = 0;
        String[] leadingTitles = { "Speaker", "Genre", "Orig Speaker", "Orig Genre", "Speaker #units",
                "Dialogue #units" };
        for (String title : leadingTitles) {
            row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString(title));
        }
        row.createCell(iCol++, Cell.CELL_TYPE_STRING)
.setCellValue(creationHelper.createRichTextString("Speaker #words")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Dialogue #words")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Speaker #tokens")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Dialogue #tokens")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Speaker avg offset")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Dialogue avg offset")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Speaker avg wordrate")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Dialogue avg wordrate")); iCol++; for (int i = 0; i < getWin().getLeftWindowSize(); i++) { row.createCell(i + iCol, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Mean i-" + (i + 1))); } // now means per speaker List<String> spks = new ArrayList<String>(speakerScores.keySet()); Collections.sort(spks); List<Double> means = new ArrayList<Double>(); List<Double> nums = new ArrayList<Double>(); for (int i = 0; i < getWin().getLeftWindowSize(); i++) { means.add(0.0); nums.add(0.0); } int nAll = 0; int nMatch = 0; for (String spk : spks) { // System.out.println("org chk [" + originalSpks.get(spk) + "][" + spk + "]"); boolean matching = false; if ((originalSpks.get(spk) != null) && originalSpks.get(spk).contains(":")) { int li = originalSpks.get(spk).lastIndexOf(":"); String pre = originalSpks.get(spk).substring(0, li); String suf = originalSpks.get(spk).substring(li); matching = spk.startsWith(pre) && spk.endsWith(suf); } nAll++; if (!pairedCorpus || matching) { nMatch++; // System.out.println("match " + pre + " " + suf); row = sheet.createRow(iRow++); iCol = 0; String dId = 
spk.replaceFirst("(.*)_.*", "$1"); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(spk)); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue( creationHelper.createRichTextString(corpus.getGenreMap().get(spk.split(":")[0]))); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(originalSpks.get(spk))); row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper .createRichTextString(corpus.getGenreMap().get(originalSpks.get(spk).split(":")[0]))); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(spkMetrics.getNumUnits(spk)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(totMetrics.getNumUnits(dId)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(spkMetrics.getNumWords(spk)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(totMetrics.getNumWords(dId)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(spkMetrics.getNumTokens(spk)); row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue(totMetrics.getNumTokens(dId)); if (Double.isNaN(spkMetrics.getTurnOffset(spk)) || spkMetrics.getNumTurnOffsets(spk) == 0) { iCol++; } else { row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( spkMetrics.getTurnOffset(spk) / (double) spkMetrics.getNumTurnOffsets(spk)); } if (Double.isNaN(totMetrics.getTurnOffset(dId)) || totMetrics.getNumTurnOffsets(dId) == 0) { iCol++; } else { row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC).setCellValue( totMetrics.getTurnOffset(dId) / (double) totMetrics.getNumTurnOffsets(dId)); } if (Double.isNaN(spkMetrics.getWordRate(spk)) || spkMetrics.getNumWordRates(spk) == 0) { iCol++; } else { row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC) .setCellValue(spkMetrics.getWordRate(spk) / (double) spkMetrics.getNumWordRates(spk)); } if (Double.isNaN(totMetrics.getWordRate(dId)) || totMetrics.getNumWordRates(dId) == 0) { iCol++; } else { row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC) 
.setCellValue(totMetrics.getWordRate(dId) / (double) totMetrics.getNumWordRates(dId)); } iCol++; for (int i = 0; i < speakerScores.get(spk).size(); i++) { if (speakerN.get(spk).get(i) > 0.0) { double mean = speakerScores.get(spk).get(i) / speakerN.get(spk).get(i); row.createCell(i + iCol, Cell.CELL_TYPE_NUMERIC).setCellValue(mean); means.set(i, means.get(i) + mean); nums.set(i, nums.get(i) + 1); } } } } System.out.println("Matched " + nMatch + " of " + nAll); // and a final row for overall means row = sheet.createRow(iRow++); iCol = 14; row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Overall")); for (int i = 0; i < getWin().getLeftWindowSize(); i++) { means.set(i, means.get(i) / nums.get(i)); row.createCell(i + iCol, Cell.CELL_TYPE_NUMERIC).setCellValue(means.get(i)); } } /** * Print a summary sheet on the (gulp) excel spreadsheet */ private void printSummaryCountSheet(Workbook wb, String sheetName, HashMap<String, HashMap<Object, Integer>> allCounts, HashMap<String, HashMap<Object, Integer>> commonCounts) { CreationHelper creationHelper = wb.getCreationHelper(); sheetName = (sheetName == null ? 
"Summary" : shorten(sheetName)); System.out.println("Checking workbook " + wb + " for sheet " + sheetName); Sheet sheet = wb.getSheet(sheetName); if (sheet != null) { System.out.println("Exists, removing sheet " + sheetName); wb.removeSheetAt(wb.getSheetIndex(sheet)); } sheet = wb.createSheet(sheetName); wb.setSheetOrder(sheetName, 0); int iRow = 0; // first general identifying stuff Row row = sheet.createRow(iRow++); row.createCell(0, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Corpus")); row.createCell(1, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(getCorpus().getId())); row = sheet.createRow(iRow++); row.createCell(0, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Windower")); row.createCell(1, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(getWin().toString())); row = sheet.createRow(iRow++); row.createCell(0, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Similarity Measure")); row.createCell(1, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(getSim().toString())); // now header row = sheet.createRow(iRow++); row = sheet.createRow(iRow++); int iCol = 0; row.createCell(iCol++, Cell.CELL_TYPE_STRING).setCellValue(creationHelper.createRichTextString("Type")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Overall count")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString("Common count")); for (String genre : allCounts.keySet()) { if (genre.isEmpty()) continue; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(genre + " overall count")); row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(genre + " common count")); } ArrayList<Object> keys = new ArrayList<Object>(allCounts.get("").keySet()); Collections.sort(keys, new 
DescendingComparator<Object>(allCounts.get(""))); for (Object key : keys) { row = sheet.createRow(iRow++); iCol = 0; row.createCell(iCol++, Cell.CELL_TYPE_STRING) .setCellValue(creationHelper.createRichTextString(key.toString())); Cell cell = row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC); if (allCounts.get("").get(key) != null) { cell.setCellValue(allCounts.get("").get(key)); } cell = row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC); if (commonCounts.get("").get(key) != null) { cell.setCellValue(commonCounts.get("").get(key)); } for (String genre : allCounts.keySet()) { if (genre.isEmpty()) continue; cell = row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC); if (allCounts.get(genre).get(key) != null) { cell.setCellValue(allCounts.get(genre).get(key)); } cell = row.createCell(iCol++, Cell.CELL_TYPE_NUMERIC); if (commonCounts.get(genre).get(key) != null) { cell.setCellValue(commonCounts.get(genre).get(key)); } } } } public DialogueCorpus getCorpus() { return corpus; } public void setCorpus(DialogueCorpus corpus) { this.corpus = corpus; } public SimilarityMeasure<X> getSim() { return sim; } public void setSim(SimilarityMeasure<X> sim) { this.sim = sim; } public DialogueWindower<X> getWin() { return win; } public void setWin(DialogueWindower<X> win) { this.win = win; } /** * Normalise a data array depending on the normalisation setting * * @param data */ private void normalise(List<Double> data) { double norm = 1.0; if (normalisation == NORM_MAX) { norm = Collections.max(data); } else if (normalisation == NORM_MEAN) { double sum = 0.0; for (Double datum : data) { sum += datum; } norm = (sum / data.size()); } for (int i = 0; i < data.size(); i++) { data.set(i, data.get(i) / norm); } } /** * @param raw * the raw data * @param n * the number of desired bins (> raw.size()) * @return a list of length n via linear interpolation of data */ private List<Double> interpolate(List<Double> raw, int n) { if (n < raw.size()) { throw new RuntimeException("Can only interpolate to longer 
array"); } List<Double> cooked = new ArrayList<Double>(); for (double j = 0.0; j < n; j++) { double ind = j * (double) (raw.size() - 1) / (double) (n - 1); double last = Math.floor(ind); double next = Math.ceil(ind); double val = (last == next) ? raw.get((int) last) : raw.get((int) last) * (next - ind) + raw.get((int) next) * (ind - last); // System.out.println("I " + j + " " + ind + " " + last + " " + raw.get((int) last) + " " + next + " " // + raw.get((int) next) + " " + val); cooked.add((int) j, val); } return cooked; } /** * Average and plot the scores * * @param scores * the scores from processCorpus * @param num * number of data points (e.g. turns) per dialogue to interpolate to */ public void processScores(List<List<Double>> scores, int num) { List<Double> means = new ArrayList<Double>(); for (int m = 0; m < num; m++) { means.add(0.0); } int i = 0; for (List<Double> subScores : scores) { System.out.println("Dialogue " + i + " raw scores: " + subScores); normalise(subScores); System.out.println("Dialogue " + i + " norm scores: " + subScores); List<Double> smoothScores = smoother.smooth(subScores); System.out.println("Dialogue " + i + " smooth scores: " + smoothScores); List<Double> intScores = interpolate(smoothScores, num); System.out.println("Dialogue " + i + " intp scores: " + intScores); plotScores("Dialogue " + i, subScores, smoothScores, intScores); for (int m = 0; m < num; m++) { means.set(m, means.get(m) + intScores.get(m)); } i++; } for (int m = 0; m < num; m++) { means.set(m, means.get(m) / (double) i); } plotScores(means, "Mean over " + i + " dialogues"); } /** * Plot a turn vs score chart * * @param scores * @param title */ private void plotScores(List<Double> scores, String title) { ApplicationFrame af = new ApplicationFrame(title); final XYSeries series = new XYSeries("Coherence scores"); for (int x = 0; x < scores.size(); x++) { series.add(x, scores.get(x)); } final XYSeriesCollection data = new XYSeriesCollection(series); final JFreeChart 
chart = ChartFactory.createXYLineChart(title, "Turn", "Score", data, PlotOrientation.VERTICAL, true, true, false); final ChartPanel chartPanel = new ChartPanel(chart); chartPanel.setPreferredSize(new java.awt.Dimension(1000, 500)); af.setContentPane(chartPanel); af.pack(); RefineryUtilities.centerFrameOnScreen(af); af.setVisible(true); } /** * Plot a turn vs score chart with arbitrary subplots * * @param scores1 * @param title */ private void plotScores(String title, List<Double>... scores) { ApplicationFrame af = new ApplicationFrame(title); final CombinedDomainXYPlot plot = new CombinedDomainXYPlot(new NumberAxis()); plot.setGap(10.0); for (int i = 0; i < scores.length; i++) { final XYSeries series = new XYSeries("Coherence scores " + i); for (int x = 0; x < scores[i].size(); x++) { series.add(x, scores[i].get(x)); } XYSeriesCollection data = new XYSeriesCollection(series); XYPlot subplot = new XYPlot(data, null, new NumberAxis(), new StandardXYItemRenderer()); subplot.setRangeAxisLocation(i == 0 ? 
AxisLocation.TOP_OR_LEFT : AxisLocation.BOTTOM_OR_LEFT); plot.add(subplot, 1); } plot.setOrientation(PlotOrientation.VERTICAL); final JFreeChart chart = new JFreeChart(title, JFreeChart.DEFAULT_TITLE_FONT, plot, false); final ChartPanel chartPanel = new ChartPanel(chart); chartPanel.setPreferredSize(new java.awt.Dimension(1000, 500)); af.setContentPane(chartPanel); af.pack(); RefineryUtilities.centerFrameOnScreen(af); af.setVisible(true); } /** * Run a test, optionally producing a XLS spreadsheet and some pretty graphs * * @param baseDir * the base corpus dir, or null if using default * @param corpusRoot * "dcpse" or "swbd" * @param randType * "" for the raw corpus, "random1", "random2" etc for a defined randomisation type * @param simType * "lex" or "syn", or "syntop", "synbot" for top 10/other rules only * @param unitType * "turn" or "sent" * @param winType * "oth" or "same" * @param monteCarlo * iteration for MC * @param xlsOutput * whether to write XLS spreadsheet * @param plotGraphs * whether to plot graphs */ public static void runTest(String baseDir, String corpusRoot, String randType, String simType, String unitType, String winType, int monteCarlo, boolean xlsOutput, boolean plotGraphs) { numTestsRun++; String randSuffix = (randType.isEmpty() ? "" : "_" + randType + (monteCarlo < 0 ? "" : "_" + monteCarlo)); String runId = numTestsRun + " " + corpusRoot + randSuffix + "-" + simType + "-" + winType + "-" + unitType; File xlsFile = new File(runId + ".xlsx"); AlignmentTester<? extends DialogueUnit> at; int leftWindow = ((simType.equals("gries") || winType.startsWith("all")) ? 1 : 5); int rightWindow = 1; int stepWindow = 1; if (unitType.equals("turn") || unitType.equals("tuco")) { AlignmentTester<DialogueTurn> att = (xlsOutput ? 
new AlignmentTester<DialogueTurn>(xlsFile) : new AlignmentTester<DialogueTurn>()); if (winType.equals("oth")) { att.setWin(new OtherSpeakerTurnWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("sam")) { att.setWin(new SameSpeakerTurnWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("alloth")) { att.setWin(new OtherSpeakerAllOtherTurnWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("allsam")) { att.setWin(new SameSpeakerAllOtherTurnWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("any")) { att.setWin(new TurnWindower(null, leftWindow, rightWindow, stepWindow)); } else { throw new RuntimeException("unknown win type " + winType); } if (simType.equals("lex")) { if (unitType.equals("tuco")) { att.setSim(new TurnConcatSimilarityMeasure(new SentenceLexicalSimilarityMeasure())); } else { att.setSim(new TurnAverageSimilarityMeasure(new SentenceLexicalSimilarityMeasure())); } } else if (simType.equals("tok")) { if (unitType.equals("tuco")) { att.setSim(new TurnConcatSimilarityMeasure(new SentenceLexicalTokenSimilarityMeasure())); } else { att.setSim(new TurnAverageSimilarityMeasure(new SentenceLexicalTokenSimilarityMeasure())); } } else if (simType.startsWith("syn")) { if (unitType.equals("tuco")) { att.setSim(new TurnConcatSimilarityMeasure( new SentenceSyntacticSimilarityMeasure(TreeKernel.SYN_TREES))); } else { att.setSim(new TurnAverageSimilarityMeasure( new SentenceSyntacticSimilarityMeasure(TreeKernel.SYN_TREES))); } } else if (simType.equals("gries")) { // HACK windower doesn't control person for Gries-style similarity if (winType.equals("oth")) { if (unitType.equals("tuco")) { att.setSim( new TurnConcatSimilarityMeasure(new SentenceLastConstructionOtherSimilarityMeasure( ".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*"))); } else { att.setSim( new TurnAverageSimilarityMeasure(new SentenceLastConstructionOtherSimilarityMeasure( 
".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*"))); } } else if (winType.equals("sam")) { if (unitType.equals("tuco")) { att.setSim( new TurnConcatSimilarityMeasure(new SentenceLastConstructionSameSimilarityMeasure( ".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*"))); } else { att.setSim( new TurnAverageSimilarityMeasure(new SentenceLastConstructionSameSimilarityMeasure( ".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*"))); } } else if (winType.equals("any")) { if (unitType.equals("tuco")) { att.setSim(new TurnConcatSimilarityMeasure(new SentenceLastConstructionSimilarityMeasure( ".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*"))); } else { att.setSim(new TurnAverageSimilarityMeasure(new SentenceLastConstructionSimilarityMeasure( ".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*"))); } } } else { throw new RuntimeException("unknown sim type " + simType); } at = att; } else if (unitType.equals("sent")) { AlignmentTester<DialogueSentence> ats = (xlsOutput ? new AlignmentTester<DialogueSentence>(xlsFile) : new AlignmentTester<DialogueSentence>()); if (winType.equals("oth")) { ats.setWin(new OtherSpeakerSentenceWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("sam")) { ats.setWin(new SameSpeakerSentenceWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("alloth")) { ats.setWin(new OtherSpeakerAllOtherSentenceWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("allsam")) { ats.setWin(new SameSpeakerAllOtherSentenceWindower(null, leftWindow, rightWindow, stepWindow)); } else if (winType.equals("any")) { ats.setWin(new SentenceWindower(null, leftWindow, rightWindow, stepWindow)); } else { throw new RuntimeException("unknown win type " + winType); } if (simType.equals("lex")) { ats.setSim(new SentenceLexicalSimilarityMeasure()); } else if (simType.startsWith("tok")) { ats.setSim(new SentenceLexicalTokenSimilarityMeasure()); } else if 
(simType.startsWith("syn")) { ats.setSim(new SentenceSyntacticSimilarityMeasure(TreeKernel.SYN_TREES)); } else if (simType.equals("gries")) { // HACK windower doesn't control person for Gries-style similarity if (winType.equals("oth")) { ats.setSim(new SentenceLastConstructionOtherSimilarityMeasure(".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*")); } else if (winType.equals("sam")) { ats.setSim(new SentenceLastConstructionSameSimilarityMeasure(".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*")); } else if (winType.equals("any")) { ats.setSim(new SentenceLastConstructionSimilarityMeasure(".*:VP\\(ditr.*", ".*:VP\\(montr.*:PP\\((to|for)\\b.*")); } } else { throw new RuntimeException("unknown sim type " + simType); } at = ats; } else { throw new RuntimeException("unknown unit type " + unitType); } // // use something like this to test on the original corpus (limiting to 2-speaker cases) // at.setCorpus(new DCPSECorpus(2, 2, 0, 0)); // // use something like this if you're not running on the server // at.setCorpus(new DCPSECorpus("C:/Documents and Settings/mpurver/My Documents/corpora", 2, 2, 0, 0)); // // use something like this to test on a randomised version // at.setCorpus(new RandomCorpus(new DCPSECorpus(2, 2, 0, 0), RandomCorpus.RAND_ALL_TURNS, RandomCorpus.PAD_CUT, // RandomCorpus.LENGTH_IN_TURNS)); // // use something like this to test on a randomised version and save it to file for later replication // DialogueCorpus corpus = new RandomCorpus(new DCPSECorpus(2, 2, 0, 0), RandomCorpus.RAND_OTHER_SPEAKERS, // RandomCorpus.PAD_CUT); // corpus.writeToFile(new File(corpusName + ".corpus.gz")); // at.setCorpus(corpus); // // use something like this to use a previously generated random corpus // at.setCorpus(DialogueCorpus.readFromFile(new File(corpusName + ".corpus.gz"))); // // use something like this to (re-)parse a corpus // CorpusParser.parse(corpus); DialogueCorpus corpus = getCorpus(baseDir, corpusRoot, randSuffix, randType); if 
(randType.equals("random1") || randType.equals("random3") || randType.equals("random_same")) { // for random1, must actually set up two corpora, one for each speaker A and B ArrayList<DialogueCorpus> corpusPair = new ArrayList<DialogueCorpus>(); corpusPair.add(corpus); corpusPair.add( getCorpus(baseDir, corpusRoot, randSuffix.replace(randType, randType + "B"), randType + "B")); corpus = new CombinedCorpus(corpusPair); } at.setCorpus(corpus); TreeKernel.clearAllowedProductions(); TreeKernel.clearBannedProductions(); if (simType.equals("syntop")) { for (String bnf : corpus.topTenSynProductions()) { TreeKernel.addAllowedProduction(bnf); } } else if (simType.equals("synbot")) { for (String bnf : corpus.topTenSynProductions()) { TreeKernel.addBannedProduction(bnf); } } at.normalisation = NORM_NONE; // at.smoother = SmoothingFactory.getSmoother("gaussian(5)"); int num = 300; List<List<Double>> scores = at.processCorpus(runId); if (plotGraphs) { at.processScores(scores, num); } } private static DialogueCorpus getCorpus(String baseDir, String corpusRoot, String randSuffix, String randType) { String corpusName = corpusRoot + randSuffix + ".corpus"; DialogueCorpus corpus = DialogueCorpus.readFromFile(new File(baseDir + corpusName)); if (corpus == null) { if (!randType.isEmpty()) { corpus = DialogueCorpus.readFromFile(new File(baseDir + corpusRoot + ".corpus")); } if (corpus == null) { if (corpusRoot.startsWith("dcpse")) { CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_PAUSE, true); CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_IGNORE, true); CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_UNCLEAR, true); if (corpusRoot.endsWith("nointj")) { CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_UMM, true); CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_REACT, true); } else { CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_UMM, false); CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_REACT, 
false); } if (corpusRoot.startsWith("dcpsef")) { CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.INCLUDE_NO_BRACKETS, false); } if (corpusRoot.startsWith("dcpsefp")) { CreateTreeFromDCPSE.setOption(CreateTreeFromDCPSE.PP_LEXICAL_FEATURES, true); } corpus = (baseDir == null ? new DCPSECorpus(2, 2, 10, 0) : new DCPSECorpus(baseDir, 2, 2, 10, 0)); } else if (corpusRoot.startsWith("swbd")) { CreateTreeFromSWBD.setOption(CreateTreeFromSWBD.REPAIR_SELFREPAIRS, true); CreateTreeFromSWBD.setOption(CreateTreeFromSWBD.SIMPLIFY_CATEGORIES, true); if (corpusRoot.endsWith("nointj")) { CreateTreeFromSWBD.setOption(CreateTreeFromSWBD.INCLUDE_NO_INTJ, true); } else { CreateTreeFromSWBD.setOption(CreateTreeFromSWBD.INCLUDE_NO_INTJ, false); } corpus = (baseDir == null ? new SwitchboardCorpus(2, 2, 10, 0) : new SwitchboardCorpus(baseDir, 2, 2, 10, 0)); } else if (corpusRoot.startsWith("bnc")) { if (corpusRoot.endsWith("nointj")) { throw new RuntimeException("not implemented yet"); } // can't impose genre count as no separate metadata file - but BNC has loads in each genre anyway corpus = (baseDir == null ? 
new BNCCorpus(2, 2, 0, 0) : new BNCCorpus(baseDir, 2, 2, 0, 0)); if (corpusRoot.endsWith("ccg") || corpusRoot.endsWith("stanford") || corpusRoot.endsWith("rasp")) { throw new RuntimeException( "parsed corpus doesn't exist - better to use BNCCorpus and/or CorpusParser to create, you get more options that way!"); } } } if (!randType.isEmpty()) { // for "nointj" versions, keep the same randomisation assignment as the non-"nointj" if possible // this isn't possible with trimmed corpora, as parts of the original assignment may have been trimmed if (corpusRoot.contains("nointj") && !corpusRoot.contains("trim")) { RandomCorpus.setCorpusToCopy(DialogueCorpus.readFromFile( new File(corpusRoot.replaceAll("_nointj", "") + randSuffix + ".corpus.gz"))); corpus = new RandomCorpus(corpus, RandomCorpus.RAND_COPY_CORPUS, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, true); } else { int spkOffset = 0; if (randType.endsWith("B")) { spkOffset = 1; randType = randType.substring(0, randType.length() - 1); } if (randType.equals("random1")) { corpus = new RandomCorpus(corpus, RandomCorpus.RAND_OTHER_TURNS, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, true, spkOffset); } else if (randType.equals("random2")) { corpus = new RandomCorpus(corpus, RandomCorpus.RAND_BEST_LENGTH_MATCH, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, true, spkOffset); } else if (randType.equals("random3")) { corpus = new RandomCorpus(corpus, RandomCorpus.RAND_BEST_LENGTH_RAND, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, true, spkOffset); } else if (randType.equals("random4")) { corpus = new RandomCorpus(corpus, RandomCorpus.RAND_ALL_TURNS, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, true, spkOffset); } else if (randType.equals("random5")) { corpus = new RandomCorpus(corpus, RandomCorpus.RAND_ALL_SENTS, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_SENTS, true, true, spkOffset); } else if (randType.equals("random_same")) { corpus = new 
RandomCorpus(corpus, RandomCorpus.RAND_SAME_SPEAKER, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, false, spkOffset); } else if (randType.equals("random_s2me")) { corpus = new RandomCorpus(corpus, RandomCorpus.RAND_S2ME_SPEAKER, RandomCorpus.PAD_CUT, RandomCorpus.LENGTH_IN_TURNS, true, false, spkOffset); } } } corpus.writeToFile(new File(corpusName)); } return corpus; } /** * Run a test * * @param args */ public static void main(String[] args) { String[] base = { // "/import/imc-corpora/corpora/dcpse", "/Users/mpurver/SkyDrive/QMUL/imc-corpora/data/align", // "/import/imc-corpora/kcl/ldc/treebank_3", "/Users/mpurver/SkyDrive/QMUL/imc-corpora/data/align", // "/import/imc-corpora/corpora/bnc/bnc-xml", "/Users/mpurver/SkyDrive/QMUL/imc-corpora/data/align", // "/import/imc-corpora/corpora/bnc/bnc-xml" // "C:/Documents and Settings/mpurver/My Documents/corpora/bnc" }; // String[] corpus = { "dcpse", "swbd", "bnc_trim", "dcpse_nointj", "swbd_nointj", "bnc_nointj_trim" }; // String[] corpus = { /* "dcpse", */"swbd", /* // * "bnc_trim_stanford", "bnc_trim_ccg", /* "dcpse_nointj", // * "swbd_nointj", "bnc_nointj_trim_stanford", "bnc_nointj_trim_ccg" // */}; // String[] corpus = { "dcpsefp" }; String[] corpus = { "dcpse", "swbd", "bnc_trim_ccg" }; String[] rand = { /* "", */"random1" /* * , "random2", "random3", "random4", "random5", "random_same", "random_s2me" */ }; String[] sim = { "lex", /* "tok", */"syn", "syntop", "synbot" /* , "gries" */ }; String[] unit = { "turn", /* "tuco", "sent" */ }; String[] win = { "oth", "sam" /* , "any" */ }; int monteCarlo = 0; // number of repetitions for MC for (int i = 0; i < args.length; i++) { if (args[i].startsWith("-C")) { corpus = args[i].replaceFirst("-C", "").split(","); System.out.println("Got corpus array from clargs: " + Arrays.toString(corpus)); } else if (args[i].startsWith("-R")) { rand = args[i].replaceFirst("-R", "").split(","); System.out.println("Got rand array from clargs: " + Arrays.toString(rand)); } else 
if (args[i].startsWith("-S")) { sim = args[i].replaceFirst("-S", "").split(","); System.out.println("Got sim array from clargs: " + Arrays.toString(sim)); } else if (args[i].startsWith("-U")) { unit = args[i].replaceFirst("-U", "").split(","); System.out.println("Got unit array from clargs: " + Arrays.toString(unit)); } else if (args[i].startsWith("-W")) { win = args[i].replaceFirst("-W", "").split(","); System.out.println("Got win array from clargs: " + Arrays.toString(win)); } else if (args[i].startsWith("-M")) { monteCarlo = Integer.parseInt(args[i].replaceFirst("-M", "")); System.out.println("Got monte-carlo rounds: " + monteCarlo); } } for (int i = 0; i < corpus.length; i++) { for (int j = 0; j < rand.length; j++) { for (int k = 0; k < sim.length; k++) { for (int l = 0; l < unit.length; l++) { for (int m = 0; m < win.length; m++) { // no point doing random on same-person case if (rand[j].isEmpty() || (!rand[j].contains("s2me") && !win[m].equals("sam")) || (rand[j].contains("s2me") && win[m].equals("sam"))) { if ((monteCarlo < 1) || rand[j].isEmpty()) { runTest(base[i % base.length], corpus[i], rand[j], sim[k], unit[l], win[m], -1, true, false); } else { for (int mc = 0; mc < monteCarlo; mc++) { runTest(base[i % base.length], corpus[i], rand[j], sim[k], unit[l], win[m], mc, true, false); } } } } } } } } } }