Java tutorial
/** * * @author jhilanalkarawi * * * Copyright 2014 Brainspace Corp. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * * This file uses JACKSON JSON Library licensed under the apache license * http://jackson.codehaus.org/ * * * This is the main class and methods for generating a dataset, specifications * and fields parsed from the configuration file fields parsed into map, then * map populated with values according to the type of the field,then the * generated document printed out to the json or .dat file * */ package data_gen; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.FilenameFilter; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.sql.Timestamp; import java.text.SimpleDateFormat; import java.util.*; import java.util.Scanner; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; public class Data_gen { public static String Default_DataSet_name; public static long documents_count; public static long Docments_Total_size; public static String output_dir; public static long total = 0; public static long document_size = 0; public static long x = 0; public static long x2 = 0; public static long cnt = 0; public static long count_check = 1000; public static int file_index = 0; public static HashMap<String, Object> fields = new LinkedHashMap<>(); public static HashMap<String, Object> schema_fields = new LinkedHashMap<>(); public static List<String> tx = new ArrayList<>(); public static File[] listOfFiles; public static long max = 0; public static long min = 0; public static void main(String[] args) throws FileNotFoundException, IOException { long startTime = System.nanoTime(); if (args.length < 2) { System.out.println("Usage:"); System.out.println( "java -jar \"jarfile\" [Directory of text source folder] [Dierctory of configration file]" + "\n"); System.exit(0); } String Dir = args[0]; // get text source dir from user String config_dir = args[1]; File folder = new File(Dir); if (folder.isDirectory() == false) { System.out.println("Text souce folder is not a Directory." + "\n"); System.exit(0); } if (!config_dir.endsWith(".properties") && !config_dir.endsWith(".PROPERTIES")) { System.out.println("\n" + "There was error parsing dataset parameters from configuration file, make sure you have the 4 parameters specified and the right type of file" + "\n"); System.exit(0); } listOfFiles = folder.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.toLowerCase().endsWith(".txt"); } }); if (listOfFiles.length == 0) { System.out.println("Text source folder is empty ! Have at least one .txt file there" + "\n"); System.exit(0); } System.out.println("\n"); Parse_Document_values(config_dir);// parse config file to get class attribute values document_size = Docments_Total_size / documents_count; // to get each document size max = (long) ((double) document_size * 1.8); min = (long) ((double) document_size * 0.2); schema_fields = Parse_Document_fields(config_dir); try { LineIterator it = FileUtils.lineIterator(listOfFiles[0]); while (it.hasNext()) { tx.add(it.nextLine()); } } catch (NullPointerException | FileNotFoundException e) { System.out.println("The text source file could not be found." + "\n"); System.exit(0); } new File(output_dir).mkdir(); //////////////////////////////////////////////////////////////// build json or .dat //////////////////////////////////////////////////////////////////// if (Default_DataSet_name.endsWith(".json")) { Build_json_file(config_dir, startTime); } if (Default_DataSet_name.endsWith(".dat")) { Build_dat_file(config_dir, startTime); } generate_xml(); generate_field_map(); } //////////////////////////////////////////////////////////////////// methods //////////////////////////////////////////////////////////////////// public static LinkedHashMap<String, Object> Parse_Document_fields(String config) throws FileNotFoundException, IOException { LinkedHashMap<String, Object> fields = new LinkedHashMap<>(); File file = new File(config); FileReader fr = null; BufferedReader br = null; try { fr = new FileReader(file); br = new BufferedReader(fr); while (br.ready()) { String temp = br.readLine(); if (temp.startsWith("\"")) { String[] t = temp.split("="); String trim = t[0].substring(1, t[0].length() - 1).trim(); fields.put(trim, t[1].trim()); } } br.close(); } catch (FileNotFoundException e) { //e.printStackTrace(); System.out.println("The file could not be found."); } catch (IOException e) { // e.printStackTrace(); System.out.println("There was an error reading the file."); } return fields; } public static void Parse_Document_values(String config) throws FileNotFoundException, IOException { File file = new File(config); FileReader fr = null; BufferedReader br = null; try { fr = new FileReader(file); br = new BufferedReader(fr); while (br.ready()) { String temp = br.readLine(); String[] t = temp.split("="); if (t[0].equalsIgnoreCase("output_name")) { Default_DataSet_name = t[1].trim(); } if (t[0].equalsIgnoreCase("documents_count")) { documents_count = Long.parseLong(t[1].trim()); } if (t[0].equalsIgnoreCase("dataset_size")) { Docments_Total_size = Long.parseLong(t[1].trim()); } if (t[0].equalsIgnoreCase("output_dir")) { output_dir = t[1].trim(); } } br.close(); } catch (FileNotFoundException e) { //e.printStackTrace(); System.out.println("The configration file could not be found."); System.exit(0); } catch (IOException e) { // e.printStackTrace(); System.out.println("There was an error reading the configration file."); System.exit(0); } catch (NullPointerException e) { // e.printStackTrace(); System.out.println("There was an error reading the configration file."); System.exit(0); } catch (NumberFormatException e) { // e.printStackTrace(); System.out.println("There was an error reading the configration file."); System.exit(0); } if (Default_DataSet_name == null || documents_count <= 0 || Docments_Total_size <= 0 || output_dir == null) { System.out.println("There was an error reading the configration file\n" + "make sure you have the following parametes specified EX:\n" + "documents_count=25000\n" + "dataset_size=10000000\n" + "output_name=output.json\n" + "output_dir=./dataset_one\n"); System.exit(0); } } public static String get_text_Dir() { Scanner s = new Scanner(System.in); System.out.println("Enter the data source Directory: " + "\n"); String Dir = s.nextLine().trim(); return Dir; } private static void integer_key(HashMap<String, Object> fields, String key, String value) { if (x == 0) { try { String[] int_extract = value.split("<|>"); int start = Math.abs(Integer.parseInt(int_extract[1])); x = start; } catch (NumberFormatException | NullPointerException | ArrayIndexOutOfBoundsException e) { System.out.println( "Integer.key start value is incorrect. make sure it's positive integer between <> signs in configuration file."); System.exit(0); } } if (value.startsWith("integer.key")) { String id = String.valueOf((x++)); fields.put(key, id); } } private static void seq_integer(HashMap<String, Object> fields, String key, String value) { if (x2 == 0) { try { String[] int_extract = value.split("<|>"); int start = Integer.parseInt(int_extract[1]); x2 = start; } catch (NumberFormatException | NullPointerException | ArrayIndexOutOfBoundsException e) { System.out.println( "Integer.key start value is incorrect. make sure it's positive integer between <> signs in configuration file."); System.exit(0); } } if (value.startsWith("seq.integer")) { String id = String.valueOf((x2++)); fields.put(key, id); } } private static void range_integer(HashMap<String, Object> fields, String key, String value) { int max = 0; int min = 0; Random rand = new Random(); try { String[] values = value.split(":"); String trim_val = values[1].substring(1, values[1].length() - 1); String[] range_values = trim_val.split(","); if (range_values[0].equalsIgnoreCase("min") && range_values[1].equalsIgnoreCase("max")) { min = 1; max = Integer.MAX_VALUE; } else { min = Integer.parseInt(range_values[0]); max = Integer.parseInt(range_values[1]); } String val = String.valueOf(rand.nextInt(max - min + 1) + min); fields.put(key, val); } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Make sure you have the right integer range format in configration file"); System.out.println("example range:[3,30]" + "\n"); System.exit(0); } } private static void single_enum(HashMap<String, Object> fields, String key, String value) { String trim_val = value.substring(1, value.length() - 1); String[] values = trim_val.split(","); Random rand = new Random(); int r = rand.nextInt(values.length - 1); String final_value = values[r]; fields.put(key, final_value); } private static void multi_enum_json(HashMap<String, Object> fields, String key, String value) { try { String[] values = value.split(":"); String trim_val = values[1].substring(1, values[1].length() - 1); String[] possible_values = trim_val.split(","); LinkedList<String> all = new LinkedList(Arrays.asList(possible_values)); Random rand = new Random(); Random rand2 = new Random(); String result = ""; int r = rand.nextInt(possible_values.length - 1); if (r == 0) { r = 1; } String[] words = new String[r]; for (int i = 0; i < r; i++) { int r2 = rand.nextInt(all.size() - 1); words[i] = all.get(r2); all.remove(r2); } List<String> myList = new ArrayList<>(Arrays.asList(words)); fields.put(key, myList); } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Make sure you have the right multi value enumeration format in configration file"); System.out.println("example: multi:[science,sport,art,literature,politics]" + "\n"); System.exit(0); } } private static void multi_enum_dat(HashMap<String, Object> fields, String key, String value) { try { String[] values = value.split(":"); String trim_val = values[1].substring(1, values[1].length() - 1); String[] possible_values = trim_val.split(","); LinkedList<String> all = new LinkedList(Arrays.asList(possible_values)); Random rand = new Random(); Random rand2 = new Random(); String result = ""; int r = rand.nextInt(possible_values.length - 1); if (r == 0) { r = 1; } String[] words = new String[r]; for (int i = 0; i < r; i++) { int r2 = rand.nextInt(all.size() - 1); words[i] = all.get(r2); all.remove(r2); } String multi_values = ""; for (int i = 0; i < words.length; i++) { multi_values += words[i] + (char) 59; } multi_values = multi_values.substring(0, multi_values.length() - 1); fields.put(key, multi_values); } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Make sure you have the right multi value enumeration format in configration file"); System.out.println("example: multi:[science,sport,art,literature,politics]" + "\n"); System.exit(0); } } private static void generate_Text_json(HashMap<String, Object> fields, String key) throws IOException { String temp = ""; long doc_size = 0; Random rand = new Random(); long text_length = rand.nextInt((int) ((max - min + 1) + min)); while (doc_size < text_length) { temp += tx.remove(0) + " "; // todo: new line character beeing render as text instead of control doc_size = temp.length(); if (tx.size() == 1) { switch_file(tx); } } temp = temp.trim(); total += temp.length(); // count total documents size fields.put(key, temp); } private static void generate_Text_dat(HashMap<String, Object> fields, String key) throws IOException { String temp = ""; long doc_size = 0; Random rand = new Random(); long text_length = rand.nextInt((int) ((max - min + 1) + min)); while (doc_size < text_length) { temp += tx.remove(0) + (char) 174; // todo: new line charachter beeing render as text instead of control doc_size = temp.length(); if (tx.size() == 1) { switch_file(tx); } } temp = temp.trim(); total += temp.length(); // count total documents size fields.put(key, temp); } private static void generate_date(HashMap<String, Object> fields, String key, String value) { try { String[] values = value.split(":"); String trim_val = values[1].substring(1, values[1].length() - 1); String range[] = trim_val.split(","); for (int i = 0; i < 2; i++) { String[] tokens = range[i].split("-"); if (tokens.length == 1) { Random rand = new Random(); String month = String.valueOf(rand.nextInt(11) + 1); String day = String.valueOf(rand.nextInt(27) + 1); if (month.length() == 1) { month = "0" + month; } if (day.length() == 1) { day = "0" + day; } range[i] += "-" + month + "-" + day; } if (tokens.length == 2) { Random rand = new Random(); String day = String.valueOf(rand.nextInt(27) + 1); if (day.length() == 1) { day = "0" + day; } range[i] += "-" + day; } } //////////////////////////////////////////// range[0] += " 00:00:00"; range[1] += " 23:59:59"; long begin = Timestamp.valueOf(range[0]).getTime(); long end = Timestamp.valueOf(range[1]).getTime(); long diff = end - begin + 1; Timestamp rand = new Timestamp(begin + (long) (Math.random() * diff)); SimpleDateFormat date = new SimpleDateFormat("yyyy-M-d HH:mm:ss"); String time = date.format(rand); fields.put(key, time); } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Make sure you have the right date range format in configration file"); System.out.println("example: date:[2013-04-07,2014-09]" + "\n"); System.exit(0); } } public static String get_config_Dir() { Scanner s = new Scanner(System.in); System.out.println("\n" + "Enter the name or Directory of the configration file to be used: " + "\n"); String Dir = s.nextLine().trim(); return Dir; } private static void switch_file(List<String> tx) throws IOException { if (listOfFiles.length == 1) { file_index = 0; LineIterator it = FileUtils.lineIterator(listOfFiles[file_index]); while (it.hasNext()) { tx.add(it.nextLine()); } } if (listOfFiles.length > 1) { ++file_index; LineIterator it = FileUtils.lineIterator(listOfFiles[file_index]); while (it.hasNext()) { tx.add(it.nextLine()); } if (file_index == listOfFiles.length - 1) { file_index = 0; } } } public static void generate_xml() { SchemaGenerator schema = new SchemaGenerator(schema_fields); try { String content = schema.generateSchema(); File file = new File(output_dir + "/" + "schema.xml"); if (!file.exists()) { file.createNewFile(); } FileWriter fw = new FileWriter(file.getAbsoluteFile()); BufferedWriter bw = new BufferedWriter(fw); bw.write(content); bw.close(); } catch (IOException e) { System.out.println("There was an error writing to the xml file." + "\n"); } } public static void generate_field_map() { try { File file = new File(output_dir + "/" + "fieldMap.xml"); if (!file.exists()) { file.createNewFile(); } FileWriter fw = new FileWriter(file.getAbsoluteFile()); BufferedWriter bw = new BufferedWriter(fw); bw.write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "\n"); bw.write("<fieldMap name=\"Default Field Map\" version=\"1.0\">" + "\n"); bw.write(" <!--" + "\n"); bw.write(" There are no field maps for the Law Exporter" + "\n"); bw.write(" -->" + "\n"); bw.write("</fieldMap>" + "\n"); bw.close(); } catch (IOException e) { System.out.println("There was an error writing to the xml file." + "\n"); } } private static String generate_facet_value(String val) throws IOException { String value = ""; if (!val.endsWith("*")) { value = val; } if (val.endsWith("*")) { value = val.substring(0, val.length() - 1); } return value; } private static void Build_json_file(String config_dir, long startTime) throws IOException { File f = new File(output_dir + "/" + Default_DataSet_name); BufferedWriter wr = new BufferedWriter(new FileWriter(f)); wr.write("{\"docs\":["); wr.write("\n"); ObjectMapper objectMapper = new ObjectMapper(); //////////////////////////////////////////////////// flow control: (for loop) for number of //////////////////////////////////////////////////// documents and (while) for each field in document for (int i = 0; i <= documents_count; i++) { fields = Parse_Document_fields(config_dir); Iterator iterator = fields.keySet().iterator(); while (iterator.hasNext()) { String key = (String) iterator.next(); String v = (String) fields.get(key); String value = generate_facet_value(v); if (value.startsWith("integer.key")) { integer_key(fields, key, value); } if (value.startsWith("seq.integer")) { seq_integer(fields, key, value); } if (value.startsWith("range")) { range_integer(fields, key, value); } if (value.charAt(0) == '[') { single_enum(fields, key, value); } if (value.startsWith("multi")) { multi_enum_json(fields, key, value); } if (value.startsWith("date")) { generate_date(fields, key, value); } if (value.equals("text.key")) { generate_Text_json(fields, key); } if (value.equals("text")) { generate_Text_json(fields, key); } if (value.startsWith("(")) { String VALUE = value.substring(1, value.length() - 1); fields.put(key, VALUE); } } objectMapper.configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, false); String s = objectMapper.writeValueAsString(fields); wr.write(s); wr.write(",\n"); fields.clear(); if (i == count_check) { System.out.println("Number of Documents created: " + count_check); System.out.println("Reading from file: (" + listOfFiles[file_index] + ")"); System.out.println("Size of all documents so far: (" + total + ") Bytes"); System.out.println("\n"); count_check += 1000; } cnt = i; } System.out.println("Total Number of Documents created: " + cnt); System.out.println("Total size of Dataset created: " + total); wr.write("]}"); wr.flush(); wr.close(); long endTime = System.nanoTime(); long duration = endTime - startTime; System.out.println("Total execuion time: " + (double) duration / 1000000000.0 + " Seconds" + "\n"); } private static void Build_dat_file(String config_dir, long startTime) throws IOException { String header = ""; File f = new File(output_dir + "/" + Default_DataSet_name); Writer wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "ISO-8859-1")); fields = Parse_Document_fields(config_dir); Iterator iterator1 = fields.keySet().iterator(); while (iterator1.hasNext()) { String Key = (String) iterator1.next(); header += '\u00FE' + Key + '\u00FE' + '\u0014'; } wr.write(header.trim() + (char) 13 + (char) 10); //////////////////////////////////////////////////// flow control: (for loop) for number of //////////////////////////////////////////////////// documents and (while) for each field in document for (int i = 0; i <= documents_count; i++) { String Doc = ""; fields = Parse_Document_fields(config_dir); Iterator iterator = fields.keySet().iterator(); while (iterator.hasNext()) { String key = (String) iterator.next(); String v = (String) fields.get(key); String value = generate_facet_value(v); if (value.startsWith("integer.key")) { integer_key(fields, key, value); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.startsWith("seq.integer")) { seq_integer(fields, key, value); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.startsWith("range")) { range_integer(fields, key, value); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.charAt(0) == '[') { single_enum(fields, key, value); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.startsWith("multi")) { multi_enum_dat(fields, key, value); String temp = (String) (fields.get(key)); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.startsWith("date")) { generate_date(fields, key, value); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.equals("text.key")) { generate_Text_dat(fields, key); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.equals("text")) { generate_Text_dat(fields, key); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } if (value.startsWith("(")) { String VALUE = value.substring(1, value.length() - 1); fields.put(key, VALUE); String temp = (String) fields.get(key); Doc += (char) 254 + temp + (char) 254 + '\u0014'; } } wr.write(Doc.trim() + (char) 13 + (char) 10); fields.clear(); if (i == count_check) { System.out.println("Number of Documents created: " + count_check); System.out.println("Reading from file: (" + listOfFiles[file_index] + ")"); System.out.println("Size of all documents so far: (" + total + ") Bytes"); System.out.println("\n"); count_check += 1000; } cnt = i; } System.out.println("Total Number of Documents created: " + cnt); System.out.println("Total size of Dataset created: " + total); wr.close(); long endTime = System.nanoTime(); long duration = endTime - startTime; System.out.println("Total execuion time: " + (double) duration / 1000000000.0 + " Seconds" + "\n"); } }