Java tutorial
/* Copyright (c) 2013 Max Lungarella This file is part of Aips2Xml. Aips2Xml is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.maxl.java.aips2xml; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.StringReader; import java.io.StringWriter; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.nio.charset.CodingErrorAction; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.TreeMap; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; import javax.xml.XMLConstants; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Unmarshaller; import javax.xml.transform.OutputKeys; import javax.xml.transform.Source; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import javax.xml.validation.Validator; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities.EscapeMode; import org.jsoup.select.Elements; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import com.maxl.java.aips2xml.Preparations.Preparation; public class Aips2Xml { // Set by command line options (default values) private static String DB_LANGUAGE = ""; private static boolean SHOW_ERRORS = false; private static boolean SHOW_LOGS = true; private static String VERSION = "1.0.0"; private static String MED_TITLE = ""; private static boolean DOWNLOAD_ALL = true; private static boolean ZIP_XML = false; // XML and XSD files to be parsed (contains DE and FR -> needs to be extracted) private static final String FILE_MEDICAL_INFOS_XML = "./xml/aips_xml.xml"; private static final String FILE_MEDICAL_INFOS_XSD = "./xml/aips_xsd.xsd"; // Excel file to be parsed (DE = FR) private static final String FILE_PACKAGES_XLS = "./xls/swissmedic_packages_xls.xls"; // ****** ATC class xls file (DE != FR) ****** private static final String FILE_ATC_CLASSES_XLS = "./xls/wido_arz_amtl_atc_index_0113_xls.xls"; private static final String FILE_ATC_MULTI_LINGUAL_TXT = "./xls/atc_codes_multi_lingual.txt"; // ****** Refdata xml file to be parsed (DE != FR) ****** private static final String FILE_REFDATA_PHARMA_DE_XML = "./xml/refdata_pharma_de_xml.xml"; private static final String FILE_REFDATA_PHARMA_FR_XML = "./xml/refdata_pharma_fr_xml.xml"; // BAG xml file to be parsed (contains DE and FR) private static final String FILE_PREPARATIONS_XML = "./xml/bag_preparations_xml.xml"; // ****** Parse reports (DE != FR) ****** private static final String FILE_REPORT_BASE = "./reports/parse_report"; // Map to list with all the relevant information // HashMap is faster, but TreeMap is sort by the key :) private static Map<String, ArrayList<String>> package_info = new TreeMap<String, ArrayList<String>>(); // Map to String of atc classes, key is the ATC-code or any of its substrings private static Map<String, String> atc_map = new TreeMap<String, String>(); // Map to String of additional info, key is the SwissmedicNo5 private static Map<String, String> add_info_map = new TreeMap<String, String>(); // Global variables private static String mPackSection_str = ""; /** * Adds an option into the command line parser * @param optionName - the option name * @param description - option descriptiuon * @param hasValue - if set to true, --option=value, otherwise, --option is a boolean * @param isMandatory - if set to true, the option must be provided. */ @SuppressWarnings("static-access") static void addOption(Options opts, String optionName, String description, boolean hasValue, boolean isMandatory) { OptionBuilder opt = OptionBuilder.withLongOpt(optionName); opt = opt.withDescription(description); if (hasValue) opt = opt.hasArg(); if (isMandatory) opt = opt.isRequired(); opts.addOption(opt.create()); } static void commandLineParse(Options opts, String[] args) { CommandLineParser parser = new GnuParser(); try { CommandLine cmd = parser.parse(opts, args); if (cmd.hasOption("help")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("aips2xml", opts); System.exit(0); } if (cmd.hasOption("version")) { System.out.println("Version of aips2xml: " + VERSION); } if (cmd.hasOption("lang")) { if (cmd.getOptionValue("lang").equals("de")) DB_LANGUAGE = "de"; else if (cmd.getOptionValue("lang").equals("fr")) DB_LANGUAGE = "fr"; } if (cmd.hasOption("verbose")) SHOW_ERRORS = true; if (cmd.hasOption("quiet")) { SHOW_ERRORS = false; SHOW_LOGS = false; } if (cmd.hasOption("zip")) { ZIP_XML = true; } if (cmd.hasOption("alpha")) { MED_TITLE = cmd.getOptionValue("alpha"); } if (cmd.hasOption("nodown")) { DOWNLOAD_ALL = false; } } catch (ParseException e) { System.err.println("Parsing failed: " + e.getMessage()); } } public static void main(String[] args) { Options options = new Options(); addOption(options, "help", "print this message", false, false); addOption(options, "version", "print the version information and exit", false, false); addOption(options, "quiet", "be extra quiet", false, false); addOption(options, "verbose", "be extra verbose", false, false); addOption(options, "zip", "generate zip file", false, false); addOption(options, "lang", "use given language", true, false); addOption(options, "alpha", "only include titles which start with option value", true, false); addOption(options, "nodown", "no download, parse only", false, false); commandLineParse(options, args); // Download all files and save them in appropriate directories if (DOWNLOAD_ALL) { System.out.println(""); allDown(); } DateFormat df = new SimpleDateFormat("ddMMyy"); String date_str = df.format(new Date()); System.out.println(""); if (!DB_LANGUAGE.isEmpty()) { extractPackageInfo(); List<MedicalInformations.MedicalInformation> med_list = readAipsFile(); if (SHOW_LOGS) { System.out.println(""); System.out.println("- Generating xml and html files ... "); } long startTime = System.currentTimeMillis(); int counter = 0; String fi_complete_xml = ""; for (MedicalInformations.MedicalInformation m : med_list) { if (m.getLang().equals(DB_LANGUAGE) && m.getType().equals("fi")) { if (m.getTitle().startsWith(MED_TITLE)) { if (SHOW_LOGS) System.out.println(++counter + ": " + m.getTitle()); String[] html_str = extractHtmlSection(m); // html_str[0] -> registration numbers // html_str[1] -> content string String xml_str = convertHtmlToXml(m.getTitle(), html_str[1], html_str[0]); if (DB_LANGUAGE.equals("de")) { if (!html_str[0].isEmpty()) { String name = m.getTitle(); // Replace all "Sonderzeichen" name = name.replaceAll("[/%:]", "_"); writeToFile(html_str[1], "./fis/fi_de_html/", name + "_fi_de.html"); writeToFile(xml_str, "./fis/fi_de_xml/", name + "_fi_de.xml"); fi_complete_xml += (xml_str + "\n"); } } else if (DB_LANGUAGE.equals("fr")) { if (!html_str[0].isEmpty()) { String name = m.getTitle(); // Replace all "Sonderzeichen" name = name.replaceAll("[/%:]", "_"); writeToFile(html_str[1], "./fis/fi_fr_html/", name + "_fi_fr.html"); writeToFile(xml_str, "./fis/fi_fr_xml/", name + "_fi_fr.xml"); fi_complete_xml += (xml_str + "\n"); } } } } } // Add header to huge xml fi_complete_xml = addHeaderToXml(fi_complete_xml); // Dump to file if (DB_LANGUAGE.equals("de")) { writeToFile(fi_complete_xml, "./fis/", "fi_de.xml"); if (ZIP_XML) zipToFile("./fis/", "fi_de.xml"); } else if (DB_LANGUAGE.equals("fr")) { writeToFile(fi_complete_xml, "./fis/", "fi_fr.xml"); if (ZIP_XML) zipToFile("./fis/", "fi_fr.xml"); } // Move stylesheet file to ./fis/ folders try { File src = new File("./css/amiko_stylesheet.css"); File dst_de = new File("./fis/fi_de_html/"); File dst_fr = new File("./fis/fi_fr_html/"); if (src.exists()) { if (dst_de.exists()) FileUtils.copyFileToDirectory(src, dst_de); if (dst_fr.exists()) FileUtils.copyFileToDirectory(src, dst_fr); } } catch (IOException e) { // Unhandled! } if (SHOW_LOGS) { long stopTime = System.currentTimeMillis(); System.out.println("- Generated " + counter + " xml and html files in " + (stopTime - startTime) / 1000.0f + " sec"); } } System.exit(0); } static void allDown() { AllDown a = new AllDown(); a.downAipsXls(FILE_MEDICAL_INFOS_XSD, FILE_MEDICAL_INFOS_XML); a.downPackungenXls(FILE_PACKAGES_XLS); a.downSwissindexXml("DE", FILE_REFDATA_PHARMA_DE_XML); a.downSwissindexXml("FR", FILE_REFDATA_PHARMA_FR_XML); a.downPreparationsXml(FILE_PREPARATIONS_XML); } // FIX: Cannot get a text value form a numeric cell // ME static String getAnyValue(Cell part) { if (part != null) { switch (part.getCellType()) { case Cell.CELL_TYPE_BOOLEAN: return part.getBooleanCellValue() + ""; case Cell.CELL_TYPE_NUMERIC: return part.getNumericCellValue() + ""; case Cell.CELL_TYPE_STRING: return part.getStringCellValue() + ""; case Cell.CELL_TYPE_BLANK: return "BLANK"; case Cell.CELL_TYPE_ERROR: return "ERROR"; case Cell.CELL_TYPE_FORMULA: return "FORMEL"; } } return ""; } static void extractPackageInfo() { try { long startTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.print("- Processing packages xls ... "); // Load Swissmedic xls file FileInputStream packages_file = new FileInputStream(FILE_PACKAGES_XLS); // Get workbook instance for XLS file (HSSF = Horrible SpreadSheet Format) HSSFWorkbook packages_workbook = new HSSFWorkbook(packages_file); // Get first sheet from workbook HSSFSheet packages_sheet = packages_workbook.getSheetAt(0); // Iterate through all rows of first sheet Iterator<Row> rowIterator = packages_sheet.iterator(); int num_rows = 0; while (rowIterator.hasNext()) { Row row = rowIterator.next(); if (num_rows > 3) { String swissmedic_no5 = ""; // SwissmedicNo5 registration number (5 digits) String sequence_name = ""; String package_id = ""; String swissmedic_no8 = ""; // SwissmedicNo8 = SwissmedicNo5 + Package id (8 digits) String heilmittel_code = ""; String package_size = ""; String package_unit = ""; String swissmedic_cat = ""; String application_area = ""; String public_price = ""; String exfactory_price = ""; String therapeutic_index = ""; String withdrawn_str = ""; String speciality_str = ""; String plimitation_str = ""; String add_info_str = ""; // Contains additional information separated by ; // 0: Zulassungsnnr, 1: Sequenz, 2: Sequenzname, 3: Zulassunginhaberin, 4: T-Nummer, 5: ATC-Code, 6: Heilmittelcode // 7: Erstzulassung Prparat, 8: Zulassungsdatum Sequenz, 9: Gltigkeitsdatum, 10: Verpackung, 11: Packungsgrsse // 12: Einheit, 13: Abgabekategorie, 14: Wirkstoff, 15: Zusammensetzung, 16: Anwendungsgebiet Prparat, 17: Anwendungsgebiet Sequenz swissmedic_no5 = getAnyValue(row.getCell(0)); // Swissmedic registration number (5 digits) sequence_name = getAnyValue(row.getCell(2)); // Sequence name heilmittel_code = getAnyValue(row.getCell(6)); package_size = getAnyValue(row.getCell(11)); package_unit = getAnyValue(row.getCell(12)); swissmedic_cat = getAnyValue(row.getCell(13)); application_area = getAnyValue(row.getCell(16)); if (row.getCell(10) != null) { package_id = getAnyValue(row.getCell(10)); swissmedic_no8 = swissmedic_no5 + package_id; // Fill in row ArrayList<String> pack = new ArrayList<String>(); pack.add(swissmedic_no5); // 0 pack.add(sequence_name); // 1 pack.add(heilmittel_code); // 2 pack.add(package_size); // 3 pack.add(package_unit); // 4 pack.add(swissmedic_cat); // 5 if (!application_area.isEmpty()) pack.add(application_area + " (Swissmedic)\n"); // 6 = swissmedic + bag else pack.add(""); pack.add(public_price); // 7 pack.add(exfactory_price); // 8 pack.add(therapeutic_index); // 9 pack.add(withdrawn_str); // 10 pack.add(speciality_str); // 11 pack.add(plimitation_str); // 12 pack.add(add_info_str); // 13 package_info.put(swissmedic_no8, pack); } } num_rows++; } long stopTime = System.currentTimeMillis(); if (SHOW_LOGS) { System.out.println( (package_info.size() + 1) + " packages in " + (stopTime - startTime) / 1000.0f + " sec"); } startTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.print("- Processing atc classes xls ... "); if (DB_LANGUAGE.equals("de")) { // Load ATC classes xls file FileInputStream atc_classes_file = new FileInputStream(FILE_ATC_CLASSES_XLS); // Get workbook instance for XLS file (HSSF = Horrible SpreadSheet Format) HSSFWorkbook atc_classes_workbook = new HSSFWorkbook(atc_classes_file); // Get first sheet from workbook HSSFSheet atc_classes_sheet = atc_classes_workbook.getSheetAt(1); // Iterate through all rows of first sheet rowIterator = atc_classes_sheet.iterator(); num_rows = 0; while (rowIterator.hasNext()) { Row row = rowIterator.next(); if (num_rows > 2) { String atc_code = ""; String atc_class = ""; if (row.getCell(0) != null) { atc_code = row.getCell(0).getStringCellValue().replaceAll("\\s", ""); } if (row.getCell(2) != null) { atc_class = row.getCell(2).getStringCellValue(); } // Build a full map atc code to atc class if (atc_code.length() > 0) { atc_map.put(atc_code, atc_class); } } num_rows++; } } else if (DB_LANGUAGE.equals("fr")) { // Load multilinguagl ATC classes txt file String atc_classes_multi = readFromFile(FILE_ATC_MULTI_LINGUAL_TXT); // Loop through all lines Scanner scanner = new Scanner(atc_classes_multi); while (scanner.hasNextLine()) { String line = scanner.nextLine(); List<String> atc_class = Arrays.asList(line.split(": ")); String atc_code = atc_class.get(0); String[] atc_classes_str = atc_class.get(1).split(";"); String atc_class_french = atc_classes_str[1].trim(); atc_map.put(atc_code, atc_class_french); } scanner.close(); } stopTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out .println((atc_map.size() + 1) + " classes in " + (stopTime - startTime) / 1000.0f + " sec"); // Load Refdata xml file File refdata_xml_file = null; if (DB_LANGUAGE.equals("de")) refdata_xml_file = new File(FILE_REFDATA_PHARMA_DE_XML); else if (DB_LANGUAGE.equals("fr")) refdata_xml_file = new File(FILE_REFDATA_PHARMA_FR_XML); else { System.err.println("ERROR: DB_LANGUAGE undefined"); System.exit(1); } FileInputStream refdata_fis = new FileInputStream(refdata_xml_file); startTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.print("- Unmarshalling Refdata Pharma " + DB_LANGUAGE + " ... "); JAXBContext context = JAXBContext.newInstance(Pharma.class); Unmarshaller um = context.createUnmarshaller(); Pharma refdataPharma = (Pharma) um.unmarshal(refdata_fis); List<Pharma.ITEM> pharma_list = refdataPharma.getItem(); String smno8; for (Pharma.ITEM pharma : pharma_list) { String ean_code = pharma.getGtin(); if (ean_code.length() == 13) { smno8 = ean_code.substring(4, 12); // Extract pharma corresponding to swissmedicno8 ArrayList<String> pi_row = package_info.get(smno8); // Replace sequence_name if (pi_row != null) { if (pharma.getAddscr().length() > 0) pi_row.set(1, pharma.getDscr() + ", " + pharma.getAddscr()); else pi_row.set(1, pharma.getDscr()); if (pharma.getStatus().equals("I")) { if (DB_LANGUAGE.equals("de")) pi_row.set(10, "a.H."); else if (DB_LANGUAGE.equals("fr")) pi_row.set(10, "p.c."); } } else { if (SHOW_ERRORS) System.err.println(">> Does not exist in BAG xls: " + smno8 + " (" + pharma.getDscr() + ", " + pharma.getAddscr() + ")"); } } else if (ean_code.length() < 13) { if (SHOW_ERRORS) System.err.println(">> EAN code too short: " + ean_code + ": " + pharma.getDscr()); } else if (ean_code.length() > 13) { if (SHOW_ERRORS) System.err.println(">> EAN code too long: " + ean_code + ": " + pharma.getDscr()); } } stopTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.println(pharma_list.size() + " medis in " + (stopTime - startTime) / 1000.0f + " sec"); // Load BAG xml file File bag_xml_file = new File(FILE_PREPARATIONS_XML); FileInputStream fis_bag = new FileInputStream(bag_xml_file); startTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.print("- Processing preparations xml ... "); context = JAXBContext.newInstance(Preparations.class); um = context.createUnmarshaller(); Preparations prepInfos = (Preparations) um.unmarshal(fis_bag); List<Preparations.Preparation> prep_list = prepInfos.getPreparations(); int num_preparations = 0; for (Preparations.Preparation prep : prep_list) { String swissmedicno5_str = prep.getSwissmedicNo5(); if (swissmedicno5_str != null) { String orggencode_str = ""; // "O", "G" or empty -> "" String flagSB20_str = ""; // "Y" -> 20% or "N" -> 10% if (prep.getOrgGenCode() != null) orggencode_str = prep.getOrgGenCode(); if (prep.getFlagSB20() != null) { flagSB20_str = prep.getFlagSB20(); if (flagSB20_str.equals("Y")) { if (DB_LANGUAGE.equals("de")) flagSB20_str = "SB 20%"; else if (DB_LANGUAGE.equals("fr")) flagSB20_str = "QP 20%"; } else if (flagSB20_str.equals("N")) { if (DB_LANGUAGE.equals("de")) flagSB20_str = "SB 10%"; else if (DB_LANGUAGE.equals("fr")) flagSB20_str = "QP 10%"; } else flagSB20_str = ""; } add_info_map.put(swissmedicno5_str, orggencode_str + ";" + flagSB20_str); } List<Preparation.Packs> packs_list = prep.getPacks(); for (Preparation.Packs packs : packs_list) { // Extract codes for therapeutic index / classification String bag_application = ""; String therapeutic_code = ""; List<Preparations.Preparation.ItCodes> itcode_list = prep.getItCodes(); for (Preparations.Preparation.ItCodes itc : itcode_list) { List<Preparations.Preparation.ItCodes.ItCode> code_list = itc.getItCode(); int index = 0; for (Preparations.Preparation.ItCodes.ItCode code : code_list) { if (index == 0) { if (DB_LANGUAGE.equals("de")) therapeutic_code = code.getDescriptionDe(); else if (DB_LANGUAGE.equals("fr")) therapeutic_code = code.getDescriptionFr(); } else { if (DB_LANGUAGE.equals("de")) bag_application = code.getDescriptionDe(); else if (DB_LANGUAGE.equals("fr")) bag_application = code.getDescriptionFr(); } index++; } } // Generate new package info List<Preparation.Packs.Pack> pack_list = packs.getPack(); for (Preparation.Packs.Pack pack : pack_list) { // Get SwissmedicNo8 and used it as a key to extract all the relevant package info String swissMedicNo8 = pack.getSwissmedicNo8(); ArrayList<String> pi_row = package_info.get(swissMedicNo8); // Preparation also in BAG xml file (we have a price) if (pi_row != null) { // Update Swissmedic catory if necessary ("N->A", Y->"A+") if (pack.getFlagNarcosis().equals("Y")) pi_row.set(5, pi_row.get(5) + "+"); // Extract point limitations List<Preparations.Preparation.Packs.Pack.PointLimitations> point_limits = pack .getPointLimitations(); for (Preparations.Preparation.Packs.Pack.PointLimitations limits : point_limits) { List<Preparations.Preparation.Packs.Pack.PointLimitations.PointLimitation> plimits_list = limits .getPointLimitation(); if (plimits_list.size() > 0) if (plimits_list.get(0) != null) pi_row.set(12, ", LIM" + plimits_list.get(0).getPoints() + ""); } // Extract exfactory and public prices List<Preparations.Preparation.Packs.Pack.Prices> price_list = pack.getPrices(); for (Preparations.Preparation.Packs.Pack.Prices price : price_list) { List<Preparations.Preparation.Packs.Pack.Prices.PublicPrice> public_price = price .getPublicPrice(); List<Preparations.Preparation.Packs.Pack.Prices.ExFactoryPrice> exfactory_price = price .getExFactoryPrice(); if (exfactory_price.size() > 0) { try { float f = Float.valueOf(exfactory_price.get(0).getPrice()); String ep = String.format("%.2f", f); pi_row.set(8, "CHF " + ep); } catch (NumberFormatException e) { if (SHOW_ERRORS) System.err.println("Number format exception (exfactory price): " + swissMedicNo8 + " (" + public_price.size() + ")"); } } if (public_price.size() > 0) { try { float f = Float.valueOf(public_price.get(0).getPrice()); String pp = String.format("%.2f", f); pi_row.set(7, "CHF " + pp); if (DB_LANGUAGE.equals("de")) pi_row.set(11, ", SL"); else if (DB_LANGUAGE.equals("fr")) pi_row.set(11, ", LS"); } catch (NumberFormatException e) { if (SHOW_ERRORS) System.err.println("Number format exception (public price): " + swissMedicNo8 + " (" + public_price.size() + ")"); } } // Add application area and therapeutic code if (!bag_application.isEmpty()) pi_row.set(6, pi_row.get(6) + bag_application + " (BAG)"); pi_row.set(9, therapeutic_code); } } } } num_preparations++; } stopTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.println( num_preparations + " preparations in " + (stopTime - startTime) / 1000.0f + " sec"); // Loop through all SwissmedicNo8 numbers for (Map.Entry<String, ArrayList<String>> entry : package_info.entrySet()) { String swissmedicno8 = entry.getKey(); ArrayList<String> pi_row = entry.getValue(); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (JAXBException e) { e.printStackTrace(); } } static List<MedicalInformations.MedicalInformation> readAipsFile() { List<MedicalInformations.MedicalInformation> med_list = null; try { JAXBContext context = JAXBContext.newInstance(MedicalInformations.class); // Validation SchemaFactory sf = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); Schema schema = sf.newSchema(new File(FILE_MEDICAL_INFOS_XSD)); Validator validator = schema.newValidator(); validator.setErrorHandler(new MyErrorHandler()); // Marshaller /* Marshaller ma = context.createMarshaller(); ma.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE); MedicalInformations medi_infos = new MedicalInformations(); ma.marshal(medi_infos, System.out); */ // Unmarshaller long startTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.print("- Unmarshalling Swissmedic xml ... "); FileInputStream fis = new FileInputStream(new File(FILE_MEDICAL_INFOS_XML)); Unmarshaller um = context.createUnmarshaller(); MedicalInformations med_infos = (MedicalInformations) um.unmarshal(fis); med_list = med_infos.getMedicalInformation(); long stopTime = System.currentTimeMillis(); if (SHOW_LOGS) System.out.println(med_list.size() + " medis in " + (stopTime - startTime) / 1000.0f + " sec"); } catch (IOException e) { e.printStackTrace(); } catch (JAXBException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } return med_list; } static String[] extractHtmlSection(MedicalInformations.MedicalInformation m) { // Extract section titles and section ids MedicalInformations.MedicalInformation.Sections med_sections = m.getSections(); List<MedicalInformations.MedicalInformation.Sections.Section> med_section_list = med_sections.getSection(); Document doc = Jsoup.parse(m.getContent()); doc.outputSettings().escapeMode(EscapeMode.xhtml); // Clean html code HtmlUtils html_utils = new HtmlUtils(m.getContent()); html_utils.clean(); // Extract registration number (swissmedic no5) String regnr_str = ""; if (DB_LANGUAGE.equals("de")) regnr_str = html_utils.extractRegNrDE(m.getTitle()); else if (DB_LANGUAGE.equals("fr")) regnr_str = html_utils.extractRegNrFR(m.getTitle()); // Sanitize html String html_sanitized = ""; // First check for bad boys (version=1! but actually version>1!) if (!m.getVersion().equals("1") || m.getContent().substring(0, 20).contains("xml")) { for (int i = 1; i < 22; ++i) { html_sanitized += html_utils.sanitizeSection(i, m.getTitle(), DB_LANGUAGE); } html_sanitized = "<div id=\"monographie\">" + html_sanitized + "</div>"; } else { html_sanitized = m.getContent(); } // Update "Packungen" section and extract therapeutisches index List<String> mTyIndex_list = new ArrayList<String>(); String mContent_str = updateSectionPackungen(m.getTitle(), package_info, regnr_str, html_sanitized, mTyIndex_list); // Add meta-tag and link mContent_str = mContent_str.replaceAll("<head>", "<head>" + "<link href=\"amiko_stylesheet.css\" rel=\"stylesheet\" type=\"text/css\"></>" + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"); m.setContent(mContent_str); // Fix problem with wrong div class in original Swissmedic file if (DB_LANGUAGE.equals("de")) { m.setStyle(m.getStyle().replaceAll("untertitel", "untertitle")); m.setStyle(m.getStyle().replaceAll("untertitel1", "untertitle1")); } // Correct formatting error introduced by Swissmedic m.setAuthHolder(m.getAuthHolder().replaceAll("&", "&")); // Extracts only *first* registration number /* List<String> swissmedicno5_list = Arrays.asList(regnr_str.split("\\s*,\\s*")); String[] swno5_content_map = {swissmedicno5_list.get(0), mContent_str}; */ // Extract *all* registration numbers String[] swno5_content_map = { regnr_str, mContent_str }; return swno5_content_map; //mContent_str; } static String updateSectionPackungen(String title, Map<String, ArrayList<String>> pack_info, String regnr_str, String content_str, List<String> tIndex_list) { Document doc = Jsoup.parse(content_str, "UTF-16"); List<String> pinfo_str = new ArrayList<String>(); int index = 0; // Extract swissmedicno5 registration numbers List<String> swissmedicno5_list = Arrays.asList(regnr_str.split("\\s*,\\s*")); for (String s : swissmedicno5_list) { // Extract original / generika info + Selbstbehalt info from "add_info_map" String orggen_str = ""; String flagsb_str = ""; String addinfo_str = add_info_map.get(s); if (addinfo_str != null) { List<String> ai_list = Arrays.asList(addinfo_str.split("\\s*;\\s*")); if (ai_list != null) { if (!ai_list.get(0).isEmpty()) orggen_str = ", " + ai_list.get(0); if (!ai_list.get(1).isEmpty()) flagsb_str = ", " + ai_list.get(1); } } // Now generate many swissmedicno8 = swissmedicno5 + ***, check if they're keys and retrieve package info String swissmedicno8_key = ""; for (int n = 0; n < 1000; ++n) { if (n < 10) swissmedicno8_key = s + String.valueOf(n).format("00%d", n); else if (n < 100) swissmedicno8_key = s + String.valueOf(n).format("0%d", n); else swissmedicno8_key = s + String.valueOf(n).format("%d", n); // Check if swissmedicno8_key is a key of the map if (pack_info.containsKey(swissmedicno8_key)) { ArrayList<String> pi_row = package_info.get(swissmedicno8_key); if (pi_row != null) { // --> Add "ausser Handel" information String withdrawn_str = ""; if (pi_row.get(10).length() > 0) withdrawn_str = ", " + pi_row.get(10); // --> Add public price information if (pi_row.get(7).length() > 0) { // Remove double spaces in title String medtitle = capitalizeFully(pi_row.get(1).replaceAll("\\s+", " "), 1); // Remove [QAP?] -> not an easy one! medtitle = medtitle.replaceAll("\\[(.*?)\\?\\] ", ""); pinfo_str.add("<p class=\"spacing1\">" + medtitle + ", " + pi_row.get(7) + withdrawn_str + " [" + pi_row.get(5) + pi_row.get(11) + pi_row.get(12) + flagsb_str + orggen_str + "]</p>"); } else { // Remove double spaces in title String medtitle = capitalizeFully(pi_row.get(1).replaceAll("\\s+", " "), 1); // Remove [QAP?] -> not an easy one! medtitle = medtitle.replaceAll("\\[(.*?)\\?\\] ", ""); if (DB_LANGUAGE.equals("de")) { pinfo_str.add("<p class=\"spacing1\">" + medtitle + ", " + "k.A." + withdrawn_str + " [" + pi_row.get(5) + pi_row.get(11) + pi_row.get(12) + flagsb_str + orggen_str + "]</p>"); } else if (DB_LANGUAGE.equals("fr")) { pinfo_str.add("<p class=\"spacing1\">" + medtitle + ", " + "prix n.s." + withdrawn_str + " [" + pi_row.get(5) + pi_row.get(11) + pi_row.get(12) + flagsb_str + orggen_str + "]</p>"); } } // --> Add "tindex_str" and "application_str" (see SqlDatabase.java) if (index == 0) { tIndex_list.add(pi_row.get(9)); // therapeutic index tIndex_list.add(pi_row.get(6)); // application area index++; } } } } } // In case the pinfo_str is empty due to malformed XML /* if (pinfo_str.isEmpty()) html_utils.extractPackSection(); */ // In case nothing was found if (index == 0) { tIndex_list.add(""); tIndex_list.add(""); } // Replace original package information with pinfo_str String p_str = ""; mPackSection_str = ""; for (String p : pinfo_str) { p_str += p; } // Generate a html-deprived string file mPackSection_str = p_str.replaceAll("\\<p.*?\\>", ""); mPackSection_str = mPackSection_str.replaceAll("<\\/p\\>", "\n"); // Remove last \n if (mPackSection_str.length() > 0) mPackSection_str = mPackSection_str.substring(0, mPackSection_str.length() - 1); doc.outputSettings().escapeMode(EscapeMode.xhtml); Element div7800 = doc.select("[id=Section7800]").first(); if (div7800 != null) { div7800.html("<div class=\"absTitle\">Packungen</div>" + p_str); } else { Element div18 = doc.select("[id=section18]").first(); if (div18 != null) { div18.html("<div class=\"absTitle\">Packungen</div>" + p_str); } else { if (SHOW_ERRORS) System.err.println(">> ERROR: elem is null, sections 18/7800 does not exist: " + title); } } return doc.html(); } static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", ""); if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; } static String addHeaderToXml(String xml_str) { Document mDoc = Jsoup.parse("<kompendium>\n" + xml_str + "</kompendium>"); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // Add date Date df = new Date(); String date_str = df.toString(); mDoc.select("kompendium").first().prependElement("date"); mDoc.select("date").first().text(date_str); // Add language mDoc.select("date").after("<lang></lang>"); if (DB_LANGUAGE.equals("de")) mDoc.select("lang").first().text("DE"); else if (DB_LANGUAGE.equals("fr")) mDoc.select("lang").first().text("FR"); // Fool jsoup.parse which seems to have its own "life" mDoc.select("tbody").unwrap(); Elements img_elems = mDoc.select("img"); for (Element img_e : img_elems) { if (!img_e.hasAttr("src")) img_e.unwrap(); } mDoc.select("img").tagName("image"); String final_xml_str = mDoc.select("kompendium").first().outerHtml(); return final_xml_str; } static String prettyFormat(String input) { try { Source xmlInput = new StreamSource(new StringReader(input)); StringWriter stringWriter = new StringWriter(); StreamResult xmlOutput = new StreamResult(stringWriter); Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); transformer.transform(xmlInput, xmlOutput); return xmlOutput.getWriter().toString(); } catch (Exception e) { throw new RuntimeException(e); // simple exception handling, please review it } } static String capitalizeFully(String s, int N) { // Split string String[] tokens = s.split("\\s"); // Capitalize only first word! tokens[0] = tokens[0].toUpperCase(); // Reassemble string String full_s = ""; if (tokens.length > 1) { for (int i = 0; i < tokens.length - 1; i++) { full_s += (tokens[i] + " "); } full_s += tokens[tokens.length - 1]; } else { full_s = tokens[0]; } return full_s; } static String readFromFile(String filename) { String file_str = ""; try { FileInputStream fis = new FileInputStream(filename); BufferedReader br = new BufferedReader(new InputStreamReader(fis)); String line; while ((line = br.readLine()) != null) { file_str += (line + "\n"); } br.close(); } catch (Exception e) { System.err.println(">> Error in reading file"); } return file_str; } static void writeToFile(String string_to_write, String dir_name, String file_name) { try { File wdir = new File(dir_name); if (!wdir.exists()) wdir.mkdirs(); File wfile = new File(dir_name + file_name); if (!wfile.exists()) wfile.createNewFile(); // FileWriter fw = new FileWriter(wfile.getAbsoluteFile()); CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder(); encoder.onMalformedInput(CodingErrorAction.REPORT); encoder.onUnmappableCharacter(CodingErrorAction.REPORT); OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(wfile.getAbsoluteFile()), encoder); BufferedWriter bw = new BufferedWriter(osw); bw.write(string_to_write); bw.close(); } catch (IOException e) { e.printStackTrace(); } } static void zipToFile(String dir_name, String file_name) { byte[] buffer = new byte[1024]; try { FileOutputStream fos = new FileOutputStream(dir_name + changeExtension(file_name, "zip")); ZipOutputStream zos = new ZipOutputStream(fos); ZipEntry ze = new ZipEntry(file_name); zos.putNextEntry(ze); FileInputStream in = new FileInputStream(dir_name + file_name); int len = 0; while ((len = in.read(buffer)) > 0) { zos.write(buffer, 0, len); } in.close(); zos.closeEntry(); zos.close(); } catch (IOException e) { e.printStackTrace(); } } static String changeExtension(String orig_name, String new_extension) { int last_dot = orig_name.lastIndexOf("."); if (last_dot != -1) return orig_name.substring(0, last_dot) + "." + new_extension; else return orig_name + "." + new_extension; } static class MyErrorHandler implements ErrorHandler { public void warning(SAXParseException exception) throws SAXException { System.out.println("\nWARNING"); exception.printStackTrace(); } public void error(SAXParseException exception) throws SAXException { System.out.println("\nERROR"); exception.printStackTrace(); } public void fatalError(SAXParseException exception) throws SAXException { System.out.println("\nFATAL ERROR"); exception.printStackTrace(); } } }