// Java tutorial
/**
 * Copyright 2015, Emory University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.mathcs.nlp.zzz;

import edu.emory.mathcs.nlp.common.collection.tuple.Pair;
import edu.emory.mathcs.nlp.common.util.FileUtils;
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.Joiner;
import edu.emory.mathcs.nlp.component.tokenizer.EnglishTokenizer;
import edu.emory.mathcs.nlp.component.tokenizer.Tokenizer;
import edu.emory.mathcs.nlp.component.tokenizer.token.Token;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads report text from a CSV file and either prints a few selected columns
 * ({@link #categorize(String)}) or writes every report, sentence-segmented and tokenized, to its
 * own numbered text file ({@link #tokenize(String, int)}).
 *
 * <p>While printing, line breaks are inserted around a fixed set of section headers
 * ({@link #BEFORE} and {@link #AFTER}) so that each header starts and/or ends a line.
 *
 * @author Jinho D. Choi ({@code jinho.choi@emory.edu})
 */
public class CSVRadiology {
    /** Input used by {@link #main(String[])} when no file is given on the command line. */
    private static final String DEFAULT_INPUT_FILE =
            "/Users/jdchoi/Emory/radiology/de-identification/1986/Remaining_1986Reports_FULL.csv";

    /** Output file number used by {@link #main(String[])} when none is given on the command line. */
    private static final int DEFAULT_OUTPUT_START = 500;

    /** Index of the last record {@link #categorize(String)} prints (record 0 is skipped). */
    private static final int MAX_CATEGORIZED_RECORD = 500;

    /** First column (inclusive) joined by {@link #categorize(String)}. */
    private static final int CATEGORY_COLUMN_BEGIN = 2;

    /** Last column (exclusive) joined by {@link #categorize(String)}. */
    private static final int CATEGORY_COLUMN_END = 7;

    /** Headers that get a line break inserted in front of them. */
    final String[] BEFORE = {
        "Patient Name :", "DOB :", "SEX :", "Ordering Physician :", "Exam :", "HEAD CT",
        "CLINICAL", "TECHNIQUE :", "COMPARISON :", "FINDINGS :", "IMPRESSION :"
    };

    /** Headers that get a line break inserted right after them. */
    final String[] AFTER = {
        "INDICATION :", "TECHNIQUE :", "COMPARISON :", "FINDINGS :", "IMPRESSION :"
    };

    /**
     * (pattern, replacement) rules derived from {@link #BEFORE} and {@link #AFTER}. Built eagerly
     * so that {@link #print(PrintStream, List)} is usable without calling
     * {@link #tokenize(String, int)} first.
     */
    List<Pair<Pattern, String>> P_BEFORE = compileRules(BEFORE, true);
    List<Pair<Pattern, String>> P_AFTER = compileRules(AFTER, false);

    /** Splits text on the line breaks inserted by the header rules. */
    Pattern NEW_LINE = Pattern.compile("\n");

    /**
     * Prints columns 2 through 6 of records 1 through 500 of the CSV file to standard output, one
     * record per line with the column values separated by a single space. Record 0 is skipped
     * (presumably a header row). Files with fewer than 501 records are printed up to their last
     * record.
     *
     * @param inputFile path of the CSV file to read.
     * @throws Exception if the file cannot be read or parsed, or a printed record has fewer than
     *         seven columns.
     */
    public void categorize(String inputFile) throws Exception {
        try (CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT)) {
            List<CSVRecord> records = parser.getRecords();
            // Never index past the end of a short file.
            int last = Math.min(MAX_CATEGORIZED_RECORD, records.size() - 1);

            for (int i = 1; i <= last; i++) {
                CSVRecord record = records.get(i);
                StringJoiner join = new StringJoiner(" ");

                for (int j = CATEGORY_COLUMN_BEGIN; j < CATEGORY_COLUMN_END; j++) {
                    join.add(record.get(j));
                }

                System.out.println(join.toString());
            }
        }
    }

    /**
     * Tokenizes column 0 of every record in the CSV file and writes each record to its own text
     * file. Output files are named by {@link #getOuputFilename(String, int)} using the path
     * returned by {@code FileUtils.getPath(inputFile)} (presumably the input file's directory)
     * and the number {@code recordIndex + outputStart}.
     *
     * @param inputFile path of the CSV file to read.
     * @param outputStart number assigned to the output file of the first record.
     * @throws Exception if the input cannot be read or parsed, or an output file cannot be created.
     */
    public void tokenize(String inputFile, int outputStart) throws Exception {
        String inputPath = FileUtils.getPath(inputFile) + "/";
        Tokenizer tokenizer = new EnglishTokenizer();

        try (CSVParser parser = new CSVParser(IOUtils.createBufferedReader(inputFile), CSVFormat.DEFAULT)) {
            List<CSVRecord> records = parser.getRecords();

            for (int i = 0; i < records.size(); i++) {
                String outputFile = getOuputFilename(inputPath, i + outputStart);

                // One output file per record; closed even if tokenization fails midway.
                try (PrintStream fout = IOUtils.createBufferedPrintStream(outputFile)) {
                    for (List<Token> tokens : tokenizer.segmentize(records.get(i).get(0))) {
                        print(fout, tokens);
                    }
                }
            }
        }
    }

    /**
     * Builds the output filename {@code inputPath + number + ".txt"}, where the number is
     * left-padded with zeros to four digits for indices in {@code [0, 9999]}. The misspelled
     * method name is kept so existing callers keep compiling.
     *
     * @param inputPath prefix of the filename; used as is, so it should end with a separator.
     * @param index number of the output file.
     * @return the output filename.
     */
    String getOuputFilename(String inputPath, int index) {
        StringBuilder build = new StringBuilder(inputPath);

        if (index < 1000) build.append(0);
        if (index < 100) build.append(0);
        if (index < 10) build.append(0);

        return build.append(index).append(".txt").toString();
    }

    /**
     * Joins the tokens with single spaces, inserts line breaks around the section headers, and
     * prints every resulting non-blank line, trimmed, to the stream.
     *
     * @param fout stream to print to.
     * @param tokens tokens of one segment as produced by the tokenizer.
     */
    void print(PrintStream fout, List<Token> tokens) {
        String s = Joiner.join(tokens, " ");
        s = applyRules(s, P_BEFORE);
        s = applyRules(s, P_AFTER);

        for (String t : NEW_LINE.split(s)) {
            t = t.trim();
            if (!t.isEmpty()) fout.println(t);
        }
    }

    /** Applies every (pattern, replacement) rule to the text, in list order. */
    private static String applyRules(String text, List<Pair<Pattern, String>> rules) {
        for (Pair<Pattern, String> rule : rules) {
            text = rule.o1.matcher(text).replaceAll(rule.o2);
        }

        return text;
    }

    /**
     * Turns header strings into (pattern, replacement) rules. Headers are matched literally and
     * replacements are quoted, so a header containing regex or replacement metacharacters is
     * still treated as plain text.
     *
     * @param headers header strings to match.
     * @param breakBefore if {@code true} the line break goes before the header, otherwise after.
     */
    private static List<Pair<Pattern, String>> compileRules(String[] headers, boolean breakBefore) {
        List<Pair<Pattern, String>> rules = new ArrayList<>(headers.length);

        for (String header : headers) {
            String replacement = breakBefore ? "\n" + header : header + "\n";
            rules.add(new Pair<>(Pattern.compile(header, Pattern.LITERAL), Matcher.quoteReplacement(replacement)));
        }

        return rules;
    }

    /**
     * Tokenizes a CSV file of reports.
     *
     * @param args optional: {@code args[0]} is the input CSV file and {@code args[1]} the number
     *        of the first output file; the built-in defaults are used for whichever is missing.
     */
    static public void main(String[] args) {
        // Other inputs this tool has been run on:
        //   /Users/jdchoi/Emory/radiology/tools/500/500-original.csv
        //   /Users/jdchoi/Emory/radiology/dat/radiology_report_151112_lemmon.csv
        String inputFile = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;

        try {
            int outputStart = args.length > 1 ? Integer.parseInt(args[1]) : DEFAULT_OUTPUT_START;
            CSVRadiology cvs = new CSVRadiology();
            cvs.tokenize(inputFile, outputStart);
            // cvs.categorize(inputFile);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}