IndexUtils.java (package edu.stanford.muse.index): indexing utilities
/* * Copyright (C) 2012 The Stanford MobiSocial Laboratory * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.stanford.muse.index; import edu.stanford.muse.datacache.Blob; import edu.stanford.muse.datacache.BlobStore; import edu.stanford.muse.email.AddressBook; import edu.stanford.muse.email.CalendarUtil; import edu.stanford.muse.email.Contact; import edu.stanford.muse.groups.Group; import edu.stanford.muse.groups.SimilarGroup; import edu.stanford.muse.util.*; import edu.stanford.muse.webapp.JSPHelper; import edu.stanford.muse.webapp.ModeConfig; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import java.io.*; import java.text.SimpleDateFormat; import java.util.*; import java.util.regex.Pattern; import java.util.stream.Collectors; /** useful utilities for indexing */ public class IndexUtils { private static Log log = LogFactory.getLog(IndexUtils.class); /** temporary method */ public static boolean query(String s, String query) { List<String> tokens = Util.tokenize(query, "|"); s = s.toLowerCase(); for (String t : tokens) { t = t.trim(); if (Util.nullOrEmpty(t)) continue; if (s.contains(t)) return true; } return false; } public static List<MultiDoc> partitionDocsByCategory(Collection<? extends Document> allDocs) { Map<String, MultiDoc> map = new LinkedHashMap<String, MultiDoc>(); for (Document d : allDocs) { CategoryDocument cd = (CategoryDocument) d; MultiDoc docs = map.get(cd.category); if (docs == null) { docs = new MultiDoc(map.size(), cd.category); map.put(cd.category, docs); } docs.add(d); } List<MultiDoc> result = new ArrayList<MultiDoc>(); for (MultiDoc docs : map.values()) result.add(docs); return result; } public static List<MultiDoc> partitionDocsByInterval(Collection<? extends DatedDocument> allDocs, boolean monthsNotYears) { List<MultiDoc> result = new ArrayList<MultiDoc>(); if (allDocs.size() == 0) return result; Pair<Date, Date> p = EmailUtils.getFirstLast(allDocs); Date first = p.getFirst(); Date last = p.getSecond(); // compute the monthly intervals List<Date> intervals; if (monthsNotYears) intervals = Util.getMonthlyIntervals(first, last); else intervals = Util.getYearlyIntervals(first, last); int nIntervals = intervals.size() - 1; for (int i = 0; i < nIntervals; i++) { String clusterDescription; Date d = intervals.get(i); GregorianCalendar c = new GregorianCalendar(); c.setTime(d); if (!monthsNotYears) clusterDescription = Integer.toString(c.get(Calendar.YEAR)); else clusterDescription = CalendarUtil.getDisplayMonth(c) + " " + c.get(Calendar.YEAR); result.add(new MultiDoc(i, clusterDescription)); } for (DatedDocument ed : allDocs) { // find which interval this email belongs to int selectedInterval = -1; // TODO: if all this API does is either partition by month or year then no need to "search". 
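// Illustrative usage sketch for the partitioning helpers above (not part of the original class).
// "allEmailDocs" is a placeholder for a collection of EmailDocuments (which are DatedDocuments)
// obtained elsewhere, e.g. from an archive.
//
//     List<MultiDoc> monthlyBuckets = IndexUtils.partitionDocsByInterval(allEmailDocs, true);  // one MultiDoc per month
//     List<MultiDoc> yearlyBuckets  = IndexUtils.partitionDocsByInterval(allEmailDocs, false); // one MultiDoc per year
//     System.out.println(monthlyBuckets.size() + " monthly clusters, " + yearlyBuckets.size() + " yearly clusters");
//
// Buckets come back oldest-first and each message lands in exactly one interval; for documents
// carrying a category instead of a date, partitionDocsByCategory gives one MultiDoc per category.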
Date c = ed.date; for (int i = 0; i < nIntervals; i++) { Date intervalStart = intervals.get(i); Date intervalEnd = intervals.get(i + 1); if (!c.before(intervalStart) && c.before(intervalEnd)) { selectedInterval = i; break; } } // this doc goes into interval # selectedInterval MultiDoc whichList = result.get(selectedInterval); whichList.add(ed); } return result; } /** * returns a map of group name -> set of docs associated with that group. * if trackNotInAnyGroup is true, adds a special group called none. */ public static Map<String, Set<EmailDocument>> partitionDocsByGroup(Collection<EmailDocument> allDocs, List<SimilarGroup<String>> groups, AddressBook addressBook, boolean trackNotInAnyGroup) { String NOTAGroupName = "None"; // NOTA = none of the above Map<String, Set<EmailDocument>> map = new LinkedHashMap<String, Set<EmailDocument>>(); if (allDocs.size() == 0) return map; GroupAssigner ca = new GroupAssigner(); ca.setupGroups(allDocs, groups, addressBook, 0); for (EmailDocument ed : allDocs) { Map.Entry<Integer, Float> e = GroupAssigner.highestValueEntry(ca.getAssignedColorWeights(ed)); int groupNum = (e != null) ? e.getKey() : -1; boolean invalidGroup = groupNum < 0 || groupNum >= groups.size(); if (invalidGroup && !trackNotInAnyGroup) continue; String groupName = (invalidGroup) ? NOTAGroupName : groups.get(groupNum).name; Set<EmailDocument> docsForGroup = map.get(groupName); if (docsForGroup == null) { docsForGroup = new LinkedHashSet<EmailDocument>(); map.put(groupName, docsForGroup); } docsForGroup.add(ed); } // now sort so that groups in the map are in original order Map<String, Set<EmailDocument>> sortedMap = new LinkedHashMap<String, Set<EmailDocument>>(); for (SimilarGroup<String> group : groups) { Set<EmailDocument> docs = map.get(group.name); if (docs != null) sortedMap.put(group.name, docs); } Set<EmailDocument> docs = map.get(NOTAGroupName); if (docs != null) sortedMap.put(NOTAGroupName, docs); return sortedMap; } /** replaces all tokens in the given text that are not in any of the entities in the given doc. * all other tokens are replaced with REDACTION_CHAR. * token is defined as a consecutive sequence of letters or digits * Note: all other characters (incl. punctuation, special symbols) are blindly copied through * anything not captured in a token is considered non-sensitive and is passed through */ public static String retainOnlyNames(String text, org.apache.lucene.document.Document doc) { StringBuilder result = new StringBuilder(); Set<String> allowedTokens = new LinkedHashSet<>(); // assemble all the allowed tokens (lower cased) from these 3 types of entities { List<String> allEntities = Arrays.asList(Archive.getAllNamesInLuceneDoc(doc, true)).stream() .map(Span::getText).collect(Collectors.toList()); for (String e : allEntities) allowedTokens.addAll(Util.tokenize(e.toLowerCase())); // names may sometimes still have punctuation; strip it. e.g. a name like "Rep. 
Duncan" should lead to the tokens "rep" and "duncan" allowedTokens = allowedTokens.stream().map(s -> Util.stripPunctuation(s)).collect(Collectors.toSet()); } final char REDACTION_CHAR = '.'; int idx = 0; boolean previousTokenAllowed = false; outer: while (true) { StringBuilder token = new StringBuilder(); // go through all the chars one by one, either passing them through or assembling them in a token that can be looked up in allowedTokens { // skip until start of next token, passing through chars to result // the letter pointed to by idx has not yet been processed while (true) { if (idx >= text.length()) break outer; char ch = text.charAt(idx++); if (Character.isLetter(ch) || Character.isDigit(ch)) { // if other chars are judged sensitive in the future, this condition should be updated token.append(ch); break; } else result.append(ch); } } Character ch; { // now, idx is just past the start of a token (with the first letter stored in token), // keep reading letters until we find a non-letter, adding it to the token // the letter pointed to by idx has not yet been processed while (true) { ch = null; if (idx >= text.length()) break; // only break out of inner loop here, not the outer. this might be the last token, and token may have some residual content, so it has to be processed ch = text.charAt(idx++); if (!Character.isLetter(ch) && !Character.isDigit(ch)) break; token.append(ch); } } // ch contains the first char beyond the token (if it is not null). If it is null, it means we have reached the end of the string // look up the token and allow it only if allowedTokens contains it // use lower case token for comparison, but when appending to result, use the original string with the original case // worried about "A" grade, we should disallow it although it could easily be a token in a name somewhere String lowerCaseToken = token.toString().toLowerCase(); // ctoken = canonicalized token boolean allowToken = allowedTokens.contains(lowerCaseToken); // however, if this token is a stop word, only allow if previous token was allowed because we don't want to start from a stop word. // note: this will still allow the stop word if it is at the beginning of a sentence, and the prev. 
	public static class Window {
		public Date start;
		public Date end;
		public List<EmailDocument> docs;

		public String toString() {
			return "[" + CalendarUtil.getDisplayMonth(start) + " - " + CalendarUtil.getDisplayMonth(end) + "), " + docs.size() + " messages";
		}
	}

	/** returns list of lists of docs organized by a series of time windows */
	public static List<Window> docsBySlidingWindow(Collection<EmailDocument> allDocs, int windowSizeInMonths, int stepSizeInMonths) {
		List<Window> result = new ArrayList<Window>();
		if (allDocs.size() == 0)
			return result;

		// compute the begin and end date of the corpus
		Date first = null;
		Date last = null;
		for (EmailDocument ed : allDocs) {
			Date d = ed.date;
			if (d == null) {
				// drop this ed
				log.warn("Warning: null date on email: " + ed.getHeader());
				continue;
			}
			if (first == null || d.before(first))
				first = d;
			if (last == null || d.after(last))
				last = d;
		}

		// compute the monthly intervals
		List<Pair<Date, Date>> intervals = Util.getSlidingMonthlyIntervalsBackward(first, last, windowSizeInMonths, stepSizeInMonths);
		int nIntervals = intervals.size();
		for (int i = 0; i < nIntervals; i++) {
			Window w = new Window();
			w.start = intervals.get(i).getFirst();
			w.end = intervals.get(i).getSecond();
			w.docs = new ArrayList<EmailDocument>();
			result.add(w);
		}

		// for each message, add it to the first interval it falls in.
		// can be made more efficient by first sorting allDocs by date etc.,
		// but that may not be needed except for a very large # of intervals and a large # of docs
		for (EmailDocument ed : allDocs) {
			Date d = ed.date;
			if (d == null)
				continue;
			for (int i = 0; i < nIntervals; i++) {
				Pair<Date, Date> interval = intervals.get(i);
				Date intervalStart = interval.getFirst();
				Date intervalEnd = interval.getSecond();
				if (d.equals(intervalStart) || d.equals(intervalEnd) || (d.after(intervalStart) && d.before(intervalEnd))) {
					result.get(i).docs.add(ed);
					break;
				}
			}
		}
		return result;
	}

	public static void dumpDocument(String prefix, String bodyText) throws IOException {
		// dump contents
		PrintWriter pw1 = new PrintWriter(new FileOutputStream(prefix + ".txt"));
		pw1.println(bodyText);
		pw1.close();
	}
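	// Illustrative usage sketch for the sliding-window grouping above (not part of the original
	// class); "allEmailDocs" is again a placeholder collection of EmailDocuments.
	//
	//     // 12-month windows, stepping back 3 months at a time:
	//     List<IndexUtils.Window> windows = IndexUtils.docsBySlidingWindow(allEmailDocs, 12, 3);
	//     for (IndexUtils.Window w : windows)
	//         System.out.println(w); // prints "[<start month> - <end month>), N messages"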
	// read all the headers
	public static List<Document> findAllDocs(String prefix) throws ClassNotFoundException, IOException {
		List<Document> allDocs = new ArrayList<Document>();
		// weird: sometimes we get a double-slash or double-backslash which kills the matching...
		// better to canonicalize first, which calling new File() and then getAbsolutePath() does
		prefix = new File(prefix).getAbsolutePath();
		String dir = ".";
		int x = prefix.lastIndexOf(File.separator);
		if (x >= 0)
			dir = prefix.substring(0, x);
		File dirFile = new File(dir);

		// select valid header files
		File files[] = dirFile.listFiles(new Util.MyFilenameFilter(prefix, ".header"));
		for (File f : files) {
			try {
				Document d = null;
				ObjectInputStream headerOIS = new ObjectInputStream(new FileInputStream(f));
				d = (Document) headerOIS.readObject();
				headerOIS.close();
				allDocs.add(d);
			} catch (Exception e) {
				Util.print_exception(e, log);
			}
		}
		System.out.println(allDocs.size() + " documents found with prefix " + prefix);
		return allDocs;
	}

	// read all the headers
	// public static List<Document> findAllDocsInJar(String prefix) throws ClassNotFoundException, IOException
	// {
	//     String dir = Util.dirName(prefix);
	//     String file = Util.baseName(prefix);
	//     JarDocCache cache = new JarDocCache(dir);
	//     List<Document> allDocs = new ArrayList<Document>(cache.getAllHeaders(file).values());
	//     return allDocs;
	// }

	static String capTo2Tokens(String s) {
		StringTokenizer st = new StringTokenizer(s);
		int nTokens = st.countTokens();
		if (nTokens <= 2)
			return s;
		else
			return st.nextToken() + " " + st.nextToken();
	}

	/**
	 * splits the input string into words, e.g.
	 * if the input is "a b c", return a list containing "a", "b" and "c".
	 * if input is "a", a list containing just "a" is returned.
	 * input must have at least one token.
	 *
	 * @param s
	 * @return
	 */
	public static List<String> splitIntoWords(String s) {
		List<String> result = new ArrayList<String>();
		if (s == null)
			return result;
		StringTokenizer st = new StringTokenizer(s);
		while (st.hasMoreTokens()) {
			String term = st.nextToken();
			if (term.startsWith("\"")) {
				term = term.substring(1); // skip the leading "
				while (st.hasMoreTokens()) {
					term += " " + st.nextToken();
					if (term.endsWith("\"")) {
						term = term.substring(0, term.length() - 1); // skip the trailing "
						break;
					}
				}
			}
			result.add(term);
		}
		return result;
	}

	/**
	 * splits the input string into pairs of words, e.g.
	 * if the input is "a b c", return a list containing "a b" and "b c".
	 * if input is "a", a list containing just "a" is returned.
	 * if input is "a b", a list containing just "a b" is returned.
	 * input must have at least one token.
	 *
	 * @param s
	 * @return
	 */
	public static List<String> splitIntoPairWords(String s) {
		List<String> result = new ArrayList<String>();
		if (s == null)
			return result;
		StringTokenizer st = new StringTokenizer(s);
		if (!st.hasMoreTokens())
			return result;
		String s1 = st.nextToken();
		if (!st.hasMoreTokens()) {
			result.add(s1);
			return result;
		}
		while (st.hasMoreTokens()) {
			String s2 = st.nextToken();
			if (DictUtils.isJoinWord(s2))
				continue;
			result.add(s1 + " " + s2);
			s1 = s2;
		}
		return result;
	}

	/**
	 * splits the input search query string into indiv. words,
	 * e.g. a b|"c d"|e returns a list of length 4: a, b, "c d", e
	 *
	 * @return
	 */
	public static List<String> getAllWordsInQuery(String s) {
		List<String> result = new ArrayList<String>();
		StringTokenizer st = new StringTokenizer(s, "|");
		while (st.hasMoreTokens()) {
			// StringTokenizer st1 = new StringTokenizer(st.nextToken());
			// while (st1.hasMoreTokens())
			{
				// String word = st1.nextToken().trim().toLowerCase(); // canonicalize and add
				String word = st.nextToken();
				word = word.trim();
				if (word.length() > 0)
					result.add(word);
			}
		}
		return result;
	}

	/**
	 * returns docs with ALL the given email/names.
looks for all aliases of the * person's name, not just the given one */ public static Set<Document> selectDocsByPersons(AddressBook ab, Collection<EmailDocument> docs, String[] emailOrNames) { return selectDocsByPersons(ab, docs, emailOrNames, null); } public static List<Document> selectDocsByPersonsAsList(AddressBook ab, Collection<EmailDocument> docs, String[] emailOrNames) { return new ArrayList<Document>(selectDocsByPersons(ab, docs, emailOrNames, null)); } /** * returns docs with any/all the given email/names. looks for all aliases of * the person's name, not just the given one. * runs through all docs times # given emailOrNames. */ private static Set<Document> selectDocsByPersons(AddressBook ab, Collection<EmailDocument> docs, String[] emailOrNames, int[] contactIds) { if (ab == null) { // no addressbook, return everything Set<Document> result = new LinkedHashSet<Document>(); result.addAll(docs); return result; } Set<Contact> contactSet = new LinkedHashSet<Contact>(); if (emailOrNames != null) { for (String e : emailOrNames) { if (e.contains(" ")) { // not a single token, likely a specific name, e.g., "john doe" Contact c = ab.lookupByEmailOrName(e); if (c != null) contactSet.add(c); } else { // single token (partial name), use lookup that returns a set, e.g., "john" //@vihari: BUG-FIX: throws Null-pointer exception when ab.lookupByNameTokenAsSet is null. if (ab.lookupByNameTokenAsSet(e) == null) { log.info("Null pointer for: " + e); continue; } else contactSet.addAll(ab.lookupByNameTokenAsSet(e)); } if (contactSet.isEmpty()) log.info("Unknown email/name " + e); } } if (contactIds != null) { for (int id : contactIds) { Contact c = ab.getContact(id); if (c == null) log.info("Unknown contact ID " + id); else contactSet.add(c); } } if (contactSet.isEmpty()) return new LinkedHashSet<Document>(); // return an empty list // consider map impl in future where we can go directly from the names to the messages // currently each call to selectDocsByPerson will go through all docs Set<Document> docsForThisPerson = IndexUtils.selectDocsByContact(ab, docs, contactSet); for (Document d : docsForThisPerson) { if (!d.equals(d)) log.error("doc not itself!"); // if (!docsForThisPerson.contains(d)) // log.error ("Email doc is not good!"); // log.info (d); } if (log.isDebugEnabled()) { StringBuilder sb = new StringBuilder(); for (String s : emailOrNames) sb.append(s + " "); log.debug(docsForThisPerson.size() + " docs with person: [" + sb + "]"); } return docsForThisPerson; } /** * returns docs with ALL the given email/names. looks for all aliases of the * person's name, not just the given one. * runs through all docs times # given emailOrNames. */ public static Set<Document> selectDocsByAllPersons(AddressBook ab, Collection<EmailDocument> docs, String[] emailOrNames, int[] contactIds) { Set<Document> result = Util.castOrCloneAsSet((Collection) docs); if (emailOrNames != null) { for (String e : emailOrNames) { result = selectDocsByPersons(ab, (Collection) result, new String[] { e }, null); if (result.isEmpty()) return result; } } if (contactIds != null) { for (int c : contactIds) { result = selectDocsByPersons(ab, (Collection) result, null, new int[] { c }); if (result.isEmpty()) return result; } } return result; } /* * returns index of doc with date closest to the date startYear/startMonth/1 * startMonth is 0-based * returns -1 if no docs or invalid start month/year */ public static int getDocIdxWithClosestDate(Collection<? 
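// Illustrative usage sketch for the person-based selection methods above (not part of the
// original class). "addressBook" and "allEmailDocs" are placeholders; the names and addresses
// are made-up example values. Aliases are resolved through the AddressBook, so any known alias
// of a person matches.
//
//     String[] people = { "john doe", "jane@example.com" };
//     Set<Document> selected = IndexUtils.selectDocsByPersons(addressBook, allEmailDocs, people);
//     Set<Document> withAll  = IndexUtils.selectDocsByAllPersons(addressBook, allEmailDocs, people, null); // every listed person must appear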
extends DatedDocument> docs, int startMonth, int startYear) { if (docs.size() == 0) return -1; if (startMonth < 0 || startYear < 0) return -1; long givenDate = new GregorianCalendar(startYear, startMonth, 1).getTime().getTime(); Long smallestDiff = Long.MAX_VALUE; int bestIdx = -1, idx = 0; for (DatedDocument d : docs) { long dateDiff = Math.abs(d.date.getTime() - givenDate); if (dateDiff < smallestDiff) { bestIdx = idx; smallestDiff = dateDiff; } idx++; } return bestIdx; } /* * public static Map<String, Collection<FacetItem>> * computeFacets(Collection<Document> docs, AddressBook addressBook, * GroupAssigner groupAssigner, Indexer indexer) * { * Map<String, Collection<FacetItem>> facetMap = new LinkedHashMap<String, * Collection<FacetItem>>(); * * // sentiments * if (indexer != null) * { * List<FacetItem> sentimentItems = new ArrayList<FacetItem>(); * * // rather brute-force, compute docs for all sentiments and then * intersect... * // a better way might be to process the selected messages and see which * sentiments they reflect * for (String sentiment: Sentiments.captionToQueryMap.keySet()) * { * String query = Sentiments.captionToQueryMap.get(sentiment); * List<Document> docsForTerm = new * ArrayList<Document>(indexer.docsWithPhrase(query, -1)); * docsForTerm.retainAll(docs); * String url = "sentiment=" + sentiment; * sentimentItems.add(new FacetItem(sentiment, * Sentiments.captionToQueryMap.get(sentiment), docsForTerm.size(), url)); * } * facetMap.put("Sentiments", sentimentItems); * } * * if (addressBook != null) * { * // groups * if (groupAssigner != null) * { * Map<SimilarGroup<String>, FacetItem> groupMap = new * LinkedHashMap<SimilarGroup<String>, FacetItem>(); * for (Document d: docs) * { * if (!(d instanceof EmailDocument)) * continue; * EmailDocument ed = (EmailDocument) d; * SimilarGroup<String> g = groupAssigner.getClosestGroup(ed); * if (g == null) * continue; * FacetItem f = groupMap.get(g); * if (f == null) * { * String url = "groupIdx=" + groupAssigner.getClosestGroupIdx(ed); * groupMap.put(g, new FacetItem(g.name, g.elementsToString(), 1, url)); * } * else * f.count++; * } * * facetMap.put("Groups", groupMap.values()); * } * * // people * Map<Contact, FacetItem> peopleMap = new LinkedHashMap<Contact, * FacetItem>(); * for (Document d: docs) * { * if (!(d instanceof EmailDocument)) * continue; * EmailDocument ed = (EmailDocument) d; * List<Contact> people = ed.getParticipatingContactsExceptOwn(addressBook); * for (Contact c: people) * { * String s = c.pickBestName(); * FacetItem f = peopleMap.get(c); * if (f == null) * { * String url = "person=" + c.canonicalEmail; * peopleMap.put(c, new FacetItem(s, c.toTooltip(), 1, url)); * } * else * f.count++; * } * } * facetMap.put("People", peopleMap.values()); * } * * // can do time also... locations? * * return facetMap; * } */ public static Map<Contact, DetailedFacetItem> partitionDocsByPerson(Collection<? 
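// Illustrative usage sketch for getDocIdxWithClosestDate above (not part of the original class).
// "datedDocs" is a placeholder collection of DatedDocuments; the returned index refers to the
// collection's iteration order.
//
//     int idx = IndexUtils.getDocIdxWithClosestDate(datedDocs, 8, 2001); // startMonth is 0-based, so 8 = September
//     // idx is -1 if the collection is empty or the month/year is invalid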
extends Document> docs, AddressBook ab) { Map<Contact, DetailedFacetItem> result = new LinkedHashMap<Contact, DetailedFacetItem>(); Map<Contact, Pair<String, String>> tooltip_cache = new LinkedHashMap<Contact, Pair<String, String>>(); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; List<Contact> people = ed.getParticipatingContactsExceptOwn(ab); for (Contact c : people) { String s = null; String tooltip = null; Pair<String, String> p = tooltip_cache.get(c); if (p != null) { s = p.getFirst(); tooltip = p.getSecond(); } else { s = c.pickBestName(); tooltip = c.toTooltip(); if (ModeConfig.isPublicMode()) { s = Util.maskEmailDomain(s); tooltip = Util.maskEmailDomain(tooltip); } tooltip_cache.put(c, new Pair<String, String>(s, tooltip)); } DetailedFacetItem f = result.get(c); if (f == null) { //String url = "person=" + c.canonicalEmail; //String url = "contact=" + ab.getContactId(c); f = new DetailedFacetItem(s, tooltip, "contact", Integer.toString(ab.getContactId(c))); result.put(c, f); } f.addDoc(ed); } } return result; } public static Map<String, DetailedFacetItem> partitionDocsByFolder(Collection<? extends Document> docs) { Map<String, DetailedFacetItem> folderNameMap = new LinkedHashMap<String, DetailedFacetItem>(); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; String s = ed.folderName; if (s == null) continue; DetailedFacetItem f = folderNameMap.get(s); if (f == null) { f = new DetailedFacetItem(Util.filePathTail(s), s, "folder", s); folderNameMap.put(s, f); } f.addDoc(ed); } return folderNameMap; } private static Map<SimilarGroup<String>, DetailedFacetItem> partitionDocsByGroup( Collection<? extends Document> docs, GroupAssigner groupAssigner) { Map<SimilarGroup<String>, DetailedFacetItem> groupMap = new LinkedHashMap<SimilarGroup<String>, DetailedFacetItem>(); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; SimilarGroup<String> g = groupAssigner.getClosestGroup(ed); if (g == null) continue; DetailedFacetItem f = groupMap.get(g); if (f == null) { f = new DetailedFacetItem(g.name, g.elementsToString(), "groupIdx", Integer.toString(groupAssigner.getClosestGroupIdx(ed))); groupMap.put(g, f); } f.addDoc(ed); } return groupMap; } private static Map<String, DetailedFacetItem> partitionDocsByDirection(Collection<? extends Document> docs, AddressBook ab) { Map<String, DetailedFacetItem> result = new LinkedHashMap<String, DetailedFacetItem>(); DetailedFacetItem f_in = new DetailedFacetItem("Received", "Incoming messages", "direction", "in"); DetailedFacetItem f_out = new DetailedFacetItem("Sent", "Outgoing messages", "direction", "out"); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; int sent_or_received = ed.sentOrReceived(ab); // if sent_or_received = 0 => neither received nor sent. so it must be implicitly received. if (sent_or_received == 0 || (sent_or_received & EmailDocument.RECEIVED_MASK) != 0) f_in.addDoc(ed); if ((sent_or_received & EmailDocument.SENT_MASK) != 0) f_out.addDoc(ed); } if (f_in.totalCount() > 0) result.put("in", f_in); if (f_out.totalCount() > 0) result.put("out", f_out); return result; } private static Map<String, DetailedFacetItem> partitionDocsByDoNotTransfer( Collection<? 
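// Illustrative usage sketch for the per-person and per-folder faceting above (not part of the
// original class). "docs" and "addressBook" are placeholders obtained elsewhere.
//
//     Map<Contact, DetailedFacetItem> byPerson = IndexUtils.partitionDocsByPerson(docs, addressBook);
//     for (Map.Entry<Contact, DetailedFacetItem> e : byPerson.entrySet())
//         System.out.println(e.getKey().pickBestName() + ": " + e.getValue().totalCount() + " messages");
//     Map<String, DetailedFacetItem> byFolder = IndexUtils.partitionDocsByFolder(docs);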
extends Document> docs) { Map<String, DetailedFacetItem> result = new LinkedHashMap<String, DetailedFacetItem>(); DetailedFacetItem t = new DetailedFacetItem("Transfer", "To be transferred", "doNotTransfer", "no"); DetailedFacetItem f = new DetailedFacetItem("Do not transfer", "Not to be transferred", "doNotTransfer", "yes"); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; if (ed.doNotTransfer) f.addDoc(ed); else t.addDoc(ed); } if (f.totalCount() > 0) result.put("Do not transfer", f); if (t.totalCount() > 0) result.put("Transfer", t); return result; } private static Map<String, DetailedFacetItem> partitionDocsByTransferWithRestrictions( Collection<? extends Document> docs) { Map<String, DetailedFacetItem> result = new LinkedHashMap<String, DetailedFacetItem>(); DetailedFacetItem t = new DetailedFacetItem("Restrictions", "Transfer with restrictions", "transferWithRestrictions", "yes"); DetailedFacetItem f = new DetailedFacetItem("No restrictions", "Transfer with no restrictions", "transferWithRestrictions", "no"); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; if (ed.transferWithRestrictions) t.addDoc(ed); else f.addDoc(ed); } if (t.totalCount() > 0) result.put("Restrictions", t); if (f.totalCount() > 0) result.put("No restrictions", f); return result; } private static Map<String, DetailedFacetItem> partitionDocsByReviewed(Collection<? extends Document> docs) { Map<String, DetailedFacetItem> result = new LinkedHashMap<String, DetailedFacetItem>(); DetailedFacetItem t = new DetailedFacetItem("Reviewed", "Reviewed", "reviewed", "yes"); DetailedFacetItem f = new DetailedFacetItem("Not reviewed", "Not reviewed", "reviewed", "no"); result.put("Not reviewed", f); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; if (ed.reviewed) t.addDoc(ed); else f.addDoc(ed); } if (t.totalCount() > 0) result.put("Reviewed", t); else result.put("Not reviewed", f); return result; } /** note: attachment types are lower-cased */ private static Map<String, DetailedFacetItem> partitionDocsByAttachmentType( Collection<? 
extends Document> docs) { Map<String, DetailedFacetItem> result = new LinkedHashMap<String, DetailedFacetItem>(); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; List<Blob> attachments = ed.attachments; if (attachments != null) for (Blob b : attachments) { String ext = Util.getExtension(b.filename); if (ext == null) ext = "none"; ext = ext.toLowerCase(); DetailedFacetItem dfi = result.get(ext); if (dfi == null) { dfi = new DetailedFacetItem(ext, ext + " attachments", "attachmentExtension", ext); result.put(ext, dfi); } dfi.addDoc(ed); } } return result; } /** version that stores actual dates instead of just counts for each facet */ public static Map<String, Collection<DetailedFacetItem>> computeDetailedFacets(Collection<Document> docs, Archive archive) { AddressBook addressBook = archive.addressBook; GroupAssigner groupAssigner = archive.groupAssigner; Map<String, Collection<DetailedFacetItem>> facetMap = new LinkedHashMap<String, Collection<DetailedFacetItem>>(); // Note: order is important here -- the facets will be displayed in the order they are inserted in facetMap // current order: sentiments, groups, people, direction, folders /* disabling sentiment facets if (indexer != null) { List<DetailedFacetItem> sentimentItems = new ArrayList<DetailedFacetItem>(); Set<Document> docSet = new LinkedHashSet<Document>(docs); // rather brute-force, compute docs for all sentiments and then intersect... // a better way might be to process the selected messages and see which sentiments they reflect Map<String, String> captionToQueryMap; if (lexicon != null && !ModeConfig.isPublicMode()) captionToQueryMap = lexicon.getCaptionToQueryMap(docs); else captionToQueryMap = new LinkedHashMap<>(); for (String sentiment : captionToQueryMap.keySet()) { String query = captionToQueryMap.get(sentiment); Indexer.QueryOptions options = new Indexer.QueryOptions(); //options.setQueryType(Indexer.QueryType.ORIGINAL); options.setSortBy(Indexer.SortBy.RELEVANCE); // to avoid unnecessary sorting Collection<Document> docsForTerm = indexer.docsForQuery(query, options); docsForTerm.retainAll(docSet); sentimentItems.add(new DetailedFacetItem(sentiment, captionToQueryMap.get(sentiment), new ArrayList<Document>(docsForTerm), "sentiment", sentiment)); } facetMap.put("sentiments", sentimentItems); } */ Set<Document> docSet = new LinkedHashSet<Document>(docs); Map<String, Set<Document>> tagToDocs = new LinkedHashMap<String, Set<Document>>(); for (Document d : docs) { if (!Util.nullOrEmpty(d.comment)) { String tag = d.comment.toLowerCase(); Set<Document> set = tagToDocs.get(tag); if (set == null) { set = new LinkedHashSet<Document>(); tagToDocs.put(tag, set); } set.add(d); } } if (addressBook != null) { // groups if (!ModeConfig.isPublicMode() && groupAssigner != null) { Map<SimilarGroup<String>, DetailedFacetItem> groupMap = partitionDocsByGroup(docs, groupAssigner); facetMap.put("groups", groupMap.values()); } // people Map<Contact, DetailedFacetItem> peopleMap = partitionDocsByPerson(docs, addressBook); facetMap.put("correspondent", peopleMap.values()); // direction Map<String, DetailedFacetItem> directionMap = partitionDocsByDirection(docs, addressBook); if (directionMap.size() > 1) facetMap.put("direction", directionMap.values()); // flags -- provide them only if they have at least 2 types in these docs. if all docs have the same value for a particular flag, no point showing it. 
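// Illustrative usage sketch for computeDetailedFacets, the method being assembled here (not part
// of the original class). "docs" and "archive" are placeholders obtained elsewhere.
//
//     Map<String, Collection<DetailedFacetItem>> facets = IndexUtils.computeDetailedFacets(docs, archive);
//     for (String facetName : facets.keySet()) // e.g. "correspondent", "direction", "annotations", "attachment type", "folders"
//         System.out.println(facetName + ": " + facets.get(facetName).size() + " values");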
Map<String, DetailedFacetItem> doNotTransferMap = partitionDocsByDoNotTransfer(docs); if (doNotTransferMap.size() > 1) facetMap.put("transfer", doNotTransferMap.values()); Map<String, DetailedFacetItem> transferWithRestrictionsMap = partitionDocsByTransferWithRestrictions( docs); if (transferWithRestrictionsMap.size() > 1) facetMap.put("restrictions", transferWithRestrictionsMap.values()); Map<String, DetailedFacetItem> reviewedMap = partitionDocsByReviewed(docs); if (reviewedMap.size() > 1) facetMap.put("reviewed", reviewedMap.values()); List<DetailedFacetItem> tagItems = new ArrayList<DetailedFacetItem>(); Set<Document> unannotatedDocs = new LinkedHashSet<Document>(docSet); for (String tag : tagToDocs.keySet()) { Set<Document> docsForTag = tagToDocs.get(tag); docsForTag.retainAll(docSet); unannotatedDocs.removeAll(docsForTag); tagItems.add(new DetailedFacetItem(tag, tag, new HashSet<Document>(docsForTag), "annotation", tag)); } if (unannotatedDocs.size() > 0) tagItems.add(new DetailedFacetItem("none", "none", new HashSet<Document>(unannotatedDocs), "annotation", "" /* empty value for annotation */)); if (tagItems.size() > 1) facetMap.put("annotations", tagItems); // attachments if (!ModeConfig.isPublicMode()) { Map<String, DetailedFacetItem> attachmentTypesMap = partitionDocsByAttachmentType(docs); facetMap.put("attachment type", attachmentTypesMap.values()); } } if (!ModeConfig.isPublicMode()) { Map<String, DetailedFacetItem> folderNameMap = partitionDocsByFolder(docs); if (folderNameMap.size() > 1) facetMap.put("folders", folderNameMap.values()); } // sort so that in each topic, the heaviest facets are first for (String s : facetMap.keySet()) { Collection<DetailedFacetItem> detailedFacets = facetMap.get(s); List<DetailedFacetItem> list = new ArrayList<DetailedFacetItem>(detailedFacets); Collections.sort(list); facetMap.put(s, list); } return facetMap; } private static Pair<Date, Date> getDateRange(Collection<? extends DatedDocument> docs) { Date first = null, last = null; for (DatedDocument dd : docs) { if (dd.date == null) continue; // should not happen if (first == null) { first = last = dd.date; continue; } if (dd.date.before(first)) first = dd.date; if (dd.date.after(last)) last = dd.date; } return new Pair<Date, Date>(first, last); } public static String getDateRangeAsString(Collection<? 
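// Illustrative usage sketch for the date-range helpers in this section (not part of the original
// class). "allEmailDocs" is a placeholder collection of EmailDocuments and "datedDocs" a
// Collection<DatedDocument>; note that month arguments here are 1-based.
//
//     String span = IndexUtils.getDateRangeAsString(allEmailDocs);
//     List<EmailDocument> june2004 = IndexUtils.selectDocsByDateRange(allEmailDocs, 2004, 6);
//     List<EmailDocument> june15   = IndexUtils.selectDocsByDateRange(allEmailDocs, 2004, 6, 15);
//     List<DatedDocument> q3       = IndexUtils.selectDocsByDateRange(datedDocs, 2004, 7, 1, 2004, 9, 30); // both endpoints inclusive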
extends DatedDocument> docs) { Pair<Date, Date> p = getDateRange(docs); Date first = p.getFirst(); Date last = p.getSecond(); String result = ""; if (first == null) result += "??"; else { Calendar c = new GregorianCalendar(); c.setTime(first); result += CalendarUtil.getDisplayMonth(c) + " " + c.get(Calendar.DAY_OF_MONTH) + ", " + c.get(Calendar.YEAR); } result += " to "; if (last == null) result += "??"; else { Calendar c = new GregorianCalendar(); c.setTime(last); result += CalendarUtil.getDisplayMonth(c) + " " + c.get(Calendar.DAY_OF_MONTH) + ", " + c.get(Calendar.YEAR); } return result; } public static <D extends DatedDocument> List<D> selectDocsByDateRange(Collection<D> c, int year, int month) { return selectDocsByDateRange(c, year, month, -1); } // date, month is 1-based NOT 0-based // if month is < 0, it is ignored public static <D extends DatedDocument> List<D> selectDocsByDateRange(Collection<D> c, int year, int month, int date) { //Calendar date is not 0 indexed: https://docs.oracle.com/javase/7/docs/api/java/util/Calendar.html#DATE --month; // adjust month to be 0 based because that's what calendar gives us boolean invalid_month = month < 0 || month > 11; boolean invalid_date = date < 1 || date > 31; List<D> result = new ArrayList<D>(); for (D d : c) { Calendar cal = new GregorianCalendar(); cal.setTime(d.date); int doc_year = cal.get(Calendar.YEAR); int doc_month = cal.get(Calendar.MONTH); int doc_date = cal.get(Calendar.DATE); if (year == doc_year && (invalid_month || month == doc_month) && (invalid_date || date == doc_date)) result.add(d); } return result; } /* * return docs in given date range (inclusive), sorted by date * month is 1-based NOT 0-based. Date is 1 based. * see calendarUtil.getDateRange specs for handling of the y/m/d fields. * if month is < 0, it is ignored, i.e. effectively 1 for the start year and * 12 for the end year * returns docs with [startDate, endDate] both inclusive */ public static List<DatedDocument> selectDocsByDateRange(Collection<DatedDocument> c, int startY, int startM, int startD, int endY, int endM, int endD) { Pair<Date, Date> p = CalendarUtil.getDateRange(startY, startM - 1, startD, endY, endM - 1, endD); Date startDate = p.getFirst(), endDate = p.getSecond(); List<DatedDocument> result = new ArrayList<>(); for (DatedDocument d : c) { //we want docs with the same date (year, month, date) or after start date SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); if (!startDate.after(d.date) && !endDate.before(d.date)) result.add(d); } // Collections.sort(result); return result; } /** * picks docs from given docs with indices from the given nums. * format for each num is: * // 2-10 or 14 i.e. a single page# or a comma separate range. * special case: "all" selects all docs */ public static List<Document> getDocNumbers(List<Document> docs, String nums[]) { List<Document> result = new ArrayList<Document>(); if (docs == null || nums == null) return result; if (nums.length == 1 && "all".equalsIgnoreCase(nums[0])) { result.addAll(docs); return result; } // format for s is: // 2-10 or 14 i.e. 
		// a single page# or a comma separated range
		for (String s : nums) {
			int startIdx = 0, endIdx = -1;
			try {
				if (s.indexOf("-") >= 0) {
					// page range
					StringTokenizer pageST = new StringTokenizer(s, "-");
					startIdx = Integer.parseInt(pageST.nextToken());
					endIdx = Integer.parseInt(pageST.nextToken());
				} else
					startIdx = endIdx = Integer.parseInt(s);
			} catch (Exception e) {
				JSPHelper.log.error("Bad doc# string in query: " + s);
				continue;
			}

			for (int idx = startIdx; idx <= endIdx; idx++) {
				if (idx < 0 || idx >= docs.size()) {
					JSPHelper.log.error("Bad doc# " + idx + " # docs = " + docs.size() + " doc num spec = " + s);
					continue;
				}
				result.add(docs.get(idx));
			}
		}
		return result;
	}

	/**
	 * returns set of all blobs that have an attachment that ends in ANY one of
	 * the given tails
	 */
	public static Set<Blob> getBlobsForAttachments(Collection<? extends Document> docs, String[] attachmentTails, BlobStore attachmentsStore) {
		Set<Blob> result = new LinkedHashSet<Blob>();
		if (attachmentTails == null)
			return result; // empty results
		if (attachmentsStore == null) {
			JSPHelper.log.error("No attachments store!");
			return result;
		}

		Set<String> neededAttachmentTails = new LinkedHashSet<String>();
		for (String s : attachmentTails)
			neededAttachmentTails.add(s);

		for (Document d : docs) {
			if (!(d instanceof EmailDocument))
				continue;
			EmailDocument ed = (EmailDocument) d;
			if (ed.attachments == null)
				continue;
			for (Blob b : ed.attachments) {
				String url = attachmentsStore.getRelativeURL(b);
				String urlTail = Util.URLtail(url);
				if (neededAttachmentTails.contains(urlTail)) {
					result.add(b);
				}
			}
		}
		return result;
	}

	/**
	 * returns set of all blobs whose type matches ANY of the given extensions
	 * ("none" is a valid type, and matches attachments that don't have an
	 * extension).
	 */
	public static Set<Blob> getBlobsForAttachmentTypes(Collection<?
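// Illustrative usage sketch for getDocNumbers and getBlobsForAttachments above (not part of the
// original class). "resultDocs", "docs" and "blobStore" are placeholders; "report.pdf" is a
// made-up attachment tail.
//
//     List<Document> picked = IndexUtils.getDocNumbers(resultDocs, new String[] { "2-10", "14" }); // "all" selects everything
//     Set<Blob> blobs = IndexUtils.getBlobsForAttachments(docs, new String[] { "report.pdf" }, blobStore);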
extends Document> docs, String[] attachmentTypes) { Set<Blob> result = new LinkedHashSet<Blob>(); // convert to a set for fast lookup Set<String> attachmentTypesSet = new LinkedHashSet<String>(); for (String t : attachmentTypes) attachmentTypesSet.add(t); for (Document d : docs) { if (!(d instanceof EmailDocument)) continue; EmailDocument ed = (EmailDocument) d; if (ed.attachments == null) continue; for (Blob b : ed.attachments) { String ext = Util.getExtension(b.filename); if (ext == null) ext = "none"; ext = ext.toLowerCase(); if (attachmentTypesSet.contains(ext)) { result.add(b); } } } return result; } /** * groupIdx can be -1, in which case we returns docs that are not assigned * to any group */ public static List<EmailDocument> getDocsForGroupIdx(Collection<EmailDocument> docs, AddressBook addressBook, GroupAssigner groupAssigner, int groupIdx) { List<EmailDocument> result = new ArrayList<EmailDocument>(); List<SimilarGroup<String>> groups = groupAssigner.getSelectedGroups(); for (EmailDocument ed : docs) { List<String> rawEmailAddrs = ed.getParticipatingAddrsExcept(addressBook.getOwnAddrs()); List<String> canonicalEmailAddrs = addressBook.convertToCanonicalAddrs(rawEmailAddrs); Collections.sort(canonicalEmailAddrs); Group<String> emailGroup = new Group<String>(canonicalEmailAddrs); int x = Group.bestFit(groups, emailGroup); // x can be -1, which is ok if (x == groupIdx) result.add(ed); } return result; } /** assigns colors to tags and sorts the rest by color */ public static List<CardTerm> computerTermOrderAndColors(List<CardTerm> tagList, int displayedCloudNum, Archive archive, GroupAssigner groupAssigner) throws IOException { List<CardTerm> filteredTagList = new ArrayList<CardTerm>(); filteredTagList.addAll(tagList); List<Pair<CardTerm, Integer>> pairs = new ArrayList<Pair<CardTerm, Integer>>(); List<CardTerm> result = new ArrayList<CardTerm>(); if (filteredTagList.size() == 0) return result; // sort all the tags for (CardTerm tag : filteredTagList) { IndexUtils.computeColorsForTag(archive, displayedCloudNum, tag, groupAssigner); int color = tag.bestColor(); if (color == -1) pairs.add(new Pair<CardTerm, Integer>(tag, Integer.MAX_VALUE)); // color = -1 should appear last else pairs.add(new Pair<CardTerm, Integer>(tag, color)); } Util.sortPairsBySecondElementIncreasing(pairs); for (Pair<CardTerm, Integer> pair : pairs) result.add(pair.getFirst()); return result; } private static void computeColorsForTag(Archive archive, int cloudNum, CardTerm tct, GroupAssigner groupAssigner) throws IOException { if (tct.colorWeights != null) return; // do nothing if we already computed colors for tags // hack for the fact that our terms map does not have 3-grams or more // we cap the term to 2 tokens for the purposes of looking up terms Map Map<Integer, Float> colorWeights; if (groupAssigner != null) { Indexer.QueryOptions options = new Indexer.QueryOptions(); options.setCluster(cloudNum); options.setQueryType(Indexer.QueryType.ORIGINAL); Collection<Document> list = archive.indexer.docsForQuery(tct.lookupTerm, options); colorWeights = groupAssigner.getAssignedColors(list); } else colorWeights = new LinkedHashMap<Integer, Float>(); tct.setColors(colorWeights); } /** sort by site-alpha, so e.g. 
all amazon links show up together */ public static void sortLinks(List<String> linksList) { Collections.sort(linksList, new Comparator<String>() { public int compare(String li1, String li2) { String site1 = Util.getTLD(li1); String site2 = Util.getTLD(li2); return site1.compareTo(site2); } }); } public static String stripExtraSpaces(String name) { String str = ""; String[] words = name.split("\\s+"); for (int wi = 0; wi < words.length; wi++) { str += words[wi]; if (wi < words.length - 1) str += " "; } return str; } /** * all suffixes of prefixes or all prefixes of suffixes. * */ public static Set<String> computeAllSubstrings(Set<String> set) { Set<String> substrs = new HashSet<String>(); for (String s : set) { substrs.addAll(computeAllPrefixes(computeAllSuffixes(s))); } return substrs; } private static List<String> computeAllSubstrings(Set<String> set, boolean sort) { Set<String> substrs = new HashSet<String>(); for (String s : set) { substrs.addAll(computeAllPrefixes(computeAllSuffixes(s))); } //sort Map<String, Integer> substrlen = new LinkedHashMap<String, Integer>(); for (String substr : substrs) substrlen.put(substr, substr.length()); List<Pair<String, Integer>> ssubstrslen = Util.sortMapByValue(substrlen); List<String> ssubstrs = new ArrayList<String>(); for (Pair<String, Integer> p : ssubstrslen) ssubstrs.add(p.getFirst()); return ssubstrs; } public static Set<String> computeAllSubstrings(String s) { s = s.replaceAll("^\\W+|\\W+$", ""); Set<String> set = new LinkedHashSet<String>(); set.add(s); return computeAllSubstrings(set); } /**@param sort in descending order of length*/ private static List<String> computeAllSubstrings(String s, boolean sort) { s = s.replaceAll("^\\W+|\\W+$", ""); Set<String> set = new LinkedHashSet<String>(); set.add(s); return computeAllSubstrings(set, sort); } private static Set<String> computeAllPrefixes(Set<String> set) { Set<String> result = new LinkedHashSet<String>(); for (String s : set) { String prefix = ""; StringTokenizer st = new StringTokenizer(s); while (st.hasMoreTokens()) { if (prefix.length() > 0) prefix += " "; prefix += st.nextToken();//.toLowerCase(); result.add(prefix); } } return result; } public static Set<String> computeAllPrefixes(String s) { Set<String> set = new LinkedHashSet<String>(); set.add(s); return computeAllPrefixes(set); } private static Set<String> computeAllSuffixes(Set<String> set) { Set<String> result = new HashSet<String>(); for (String s : set) { String suffix = ""; String[] words = s.split("\\s+"); for (int i = words.length - 1; i >= 0; i--) { if (suffix.length() > 0) suffix = " " + suffix; suffix = words[i] + suffix; result.add(suffix); } } return result; } private static Set<String> computeAllSuffixes(String s) { Set<String> set = new LinkedHashSet<String>(); set.add(s); return computeAllSuffixes(set); } /** experimental method, not used actively */ // public static Map<Integer, Collection<Collection<String>>> nameCooccurrenceInParas(Collection<EmailDocument> docs) throws IOException, GeneralSecurityException, ClassCastException, ClassNotFoundException // { // Pattern p = Pattern.compile("[1-9][0-9]*|\\d+[\\.]\\d+"); // don't want #s starting with 0, don't want numbers like 1. 
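// Illustrative sketch of the prefix/suffix/substring helpers defined above (not part of the
// original class); "Stanford Special Collections" is just a sample phrase.
//
//     Set<String> prefixes = IndexUtils.computeAllPrefixes("Stanford Special Collections");
//     // -> "Stanford", "Stanford Special", "Stanford Special Collections"
//     Set<String> substrings = IndexUtils.computeAllSubstrings("Stanford Special Collections");
//     // -> every prefix of every suffix, i.e. the three prefixes plus "Special", "Special Collections", "Collections"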
// // numMap not used currently // Map<String, List<String>> numMap = new LinkedHashMap<String, List<String>>(); // // Map<Integer, Collection<Collection<String>>> namesMap = new LinkedHashMap<Integer, Collection<Collection<String>>>(); // // for (EmailDocument ed: docs) // { // System.out.println ("D" + ed.docId); // // String contents = ""; // try { ed.getContents(); } // catch (ReadContentsException e) { Util.print_exception(e, log); } // // Matcher m = p.matcher(contents); // List<String> nums = new ArrayList<String>(); // while (m.find()) // { // String num = m.group(); // // try { // Integer.parseInt(num); // just to check for exception // nums.add(num); // } catch (Exception e) { // try { // Double.parseDouble(num); // just to check for exception // nums.add(num); // } // catch (Exception e1) { Util.report_exception(e1); } // } // } // // numMap.put(ed.docID, nums); // Collection<String> paras = Util.breakIntoParas(contents); // Collection<Collection<String>> docNames = new ArrayList<Collection<String>>(); // // for (String para: paras) // { // List<Pair<String, Float>> names = NER.namesFromText(para); // List<String> paraNames = new ArrayList<String>(); // for (Pair<String, ?> pair: names) // { // String name = pair.getFirst(); // name = name.replace("\n", " ").trim().toLowerCase().intern(); // paraNames.add(name); // } // docNames.add(paraNames); // } // namesMap.put(ed.getUniqueId(), docNames); // } // // return namesMap; // } public static Set<String> readCanonicalOwnNames(AddressBook ab) { // convert own names to canonical form Set<String> canonicalOwnNames = new LinkedHashSet<String>(); Set<String> ownNames = (ab != null) ? ab.getOwnNamesSet() : null; if (ownNames == null) return canonicalOwnNames; for (String s : ownNames) if (s != null) canonicalOwnNames.add(s.toLowerCase()); return canonicalOwnNames; } /** returns all languages in a set of docs */ public static Set<String> allLanguagesInDocs(Collection<? extends Document> docs) { Set<String> result = new LinkedHashSet<String>(); for (Document d : docs) if (d.languages != null) result.addAll(d.languages); if (result.size() == 0) result.add("english"); return result; } public static List<Document> selectDocsByRegex(Archive archive, Collection<Document> allDocs, String term) { List<Document> result = new ArrayList<Document>(); Pattern pattern = null; try { pattern = Pattern.compile(term); } catch (Exception e) { Util.report_exception(e); return result; } for (Document d : allDocs) { if (!Util.nullOrEmpty(d.description)) { if (pattern.matcher(d.description).find()) { // d.getHeader() will get message ID which may false match SSN patterns etc. result.add(d); continue; } } String text = archive.getContents(d, false /* full message */); if (pattern.matcher(text).find()) result.add(d); } return result; } private static Set<Document> selectDocsByContact(AddressBook ab, Collection<EmailDocument> docs, Set<String> contact_names, Set<String> contact_emails) { Set<Document> result = new LinkedHashSet<Document>(); // look up ci for given name // look up emails for ci for (EmailDocument ed : docs) { // assemble all names and emails for this messages List<String> allEmailsAndNames = ed.getAllAddrs(); allEmailsAndNames.addAll(ed.getAllNames()); // and match against the given ci for (String s : allEmailsAndNames) { if (contact_names.contains(s) || contact_emails.contains(s)) { result.add(ed); break; } } } return result; } /** * returns docs with the given email or name. looks for all aliases of the * person's name, not just the given one. 
* goes through ALL docs */ public static Set<Document> selectDocsByContact(AddressBook ab, Collection<EmailDocument> docs, Contact c) { if (c == null) return new LinkedHashSet<Document>(); return selectDocsByContact(ab, docs, c.names, c.emails); } private static Set<Document> selectDocsByContact(AddressBook ab, Collection<EmailDocument> docs, Set<Contact> cset) { if (cset == null) return new LinkedHashSet<Document>(); Set<String> cnames = new LinkedHashSet<String>(); Set<String> cemails = new LinkedHashSet<String>(); for (Contact c : cset) { cnames.addAll(c.names); cemails.addAll(c.emails); } return selectDocsByContact(ab, docs, cnames, cemails); } public static String canonicalizeEntity(String e) { if (e == null) return e; e = e.replaceAll("\\s\\s", " "); return e.trim().toLowerCase(); } public static void main(String args[]) { //System.out.println(query("this is a test", "testing|is|match")); List<String> substrs = computeAllSubstrings("Some. thing here", true); for (String substr : substrs) System.err.print(substr + " ::: "); System.err.println(); } static final boolean PHRASES_CAN_SPAN_EMPTY_LINE = false; // false by default, no external controls for this. // if false, empty line is treated as a sentence separator /** * NEEDS REVIEW. * removes http links etc, adds them in linkList if it is not null. removes * quoted parts of message */ public static void populateDocLinks(Document d, String text, List<LinkInfo> linkList, boolean inclQM) throws IOException { BufferedReader br = new BufferedReader(new StringReader(text)); while (true) { String line = br.readLine(); if (line == null) break; line = line.trim(); // strip links if (line.toLowerCase().contains("http:")) { StringTokenizer st = new StringTokenizer(line, " \r\n\t<>\""); // tokenize based on things likely to identify starting of link, http://... while (st.hasMoreTokens()) { String s = st.nextToken(); s = Util.stripPunctuation(s); if (s.toLowerCase().startsWith("http:")) { if (linkList != null && d != null) linkList.add(new LinkInfo(s, d)); } } } } } // normalizes newlines by getting rid of \r's public static String normalizeNewlines(String s) { if (s == null) return null; String result = s.replaceAll("\r\n", "\n"); // note: no double escapes needed, \r is directly used as a char by java, not an esc. sequence result = result.replaceAll("\r", "\n"); return result; } }
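// Illustrative usage sketch for the text utilities at the end of the class (not part of the
// original file). "rawBody" and "doc" are placeholders for a message body and its Document.
//
//     String body = IndexUtils.normalizeNewlines(rawBody);          // \r\n and bare \r both become \n
//     List<LinkInfo> links = new ArrayList<LinkInfo>();
//     IndexUtils.populateDocLinks(doc, body, links, true);          // collects http: links from the text into 'links' (throws IOException)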