List of usage examples for javax.xml.parsers.DocumentBuilder.parse
/**
 * Parses the content of the given input source and returns it as a DOM {@code Document}.
 *
 * @param is the input source to read the XML from
 * @return the {@code Document} built from the input
 * @throws SAXException declared for parse errors
 * @throws IOException declared for I/O errors
 */
public abstract Document parse(InputSource is) throws SAXException, IOException;
From source file:DOMEdit.java
static public void main(String[] arg) { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(true);// w ww.j a v a 2s. c o m dbf.setNamespaceAware(true); dbf.setIgnoringElementContentWhitespace(true); Document doc = null; try { DocumentBuilder builder = dbf.newDocumentBuilder(); builder.setErrorHandler(new MyErrorHandler()); InputSource is = new InputSource("personWithDTD.xml"); doc = builder.parse(is); append(doc, "newName", "1111111111", "newEmail"); write(doc); } catch (Exception e) { System.err.println(e); } }
From source file:Main.java
public static void main(String args[]) { DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); builderFactory.setNamespaceAware(true); // Set namespace aware builderFactory.setValidating(true); // and validating parser feaures builderFactory.setIgnoringElementContentWhitespace(true); DocumentBuilder builder = null; try {//from w w w . j a va2 s .c o m builder = builderFactory.newDocumentBuilder(); // Create the parser } catch (ParserConfigurationException e) { e.printStackTrace(); } Document xmlDoc = null; try { xmlDoc = builder.parse(new InputSource(new StringReader(xmlString))); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } DocumentType doctype = xmlDoc.getDoctype(); if (doctype == null) { System.out.println("DOCTYPE is null"); } else { System.out.println("DOCTYPE node:\n" + doctype.getInternalSubset()); } System.out.println("\nDocument body contents are:"); listNodes(xmlDoc.getDocumentElement(), ""); // Root element & children }
From source file:OldExtractor.java
/** * @param args the command line arguments *///from w ww . j a v a 2s. co m public static void main(String[] args) { // TODO code application logic here String bingUrl = "https://api.datamarket.azure.com/Bing/Search/Web?$top=10&$format=Atom&Query=%27gates%27"; //Provide your account key here. String accountKey = "ghTYY7wD6LpyxUO9VRR7e1f98WFhHWYERMcw87aQTqQ"; // String accountKey = "xqbCjT87/MQz25JWdRzgMHdPkGYnOz77IYmP5FUIgC8"; byte[] accountKeyBytes = Base64.encodeBase64((accountKey + ":" + accountKey).getBytes()); String accountKeyEnc = new String(accountKeyBytes); try { URL url = new URL(bingUrl); URLConnection urlConnection = url.openConnection(); urlConnection.setRequestProperty("Authorization", "Basic " + accountKeyEnc); InputStream inputStream = (InputStream) urlConnection.getContent(); byte[] contentRaw = new byte[urlConnection.getContentLength()]; inputStream.read(contentRaw); String content = new String(contentRaw); //System.out.println(content); try { File file = new File("Results.xml"); FileWriter fileWriter = new FileWriter(file); fileWriter.write(content); //fileWriter.write("a test"); fileWriter.flush(); fileWriter.close(); } catch (IOException e) { System.out.println(e); } DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); try { //System.out.println("here"); //Using factory get an instance of document builder DocumentBuilder db = dbf.newDocumentBuilder(); //parse using builder to get DOM representation of the XML file Document dom = db.parse("Results.xml"); Element docEle = (Element) dom.getDocumentElement(); //get a nodelist of elements NodeList nl = docEle.getElementsByTagName("d:Url"); if (nl != null && nl.getLength() > 0) { for (int i = 0; i < nl.getLength(); i++) { //get the employee element Element el = (Element) nl.item(i); // System.out.println("here"); System.out.println(el.getTextContent()); //get the Employee object //Employee e = getEmployee(el); //add it to list //myEmpls.add(e); } } NodeList n2 = 
docEle.getElementsByTagName("d:Title"); if (n2 != null && n2.getLength() > 0) { for (int i = 0; i < n2.getLength(); i++) { //get the employee element Element e2 = (Element) n2.item(i); // System.out.println("here"); System.out.println(e2.getTextContent()); //get the Employee object //Employee e = getEmployee(el); //add it to list //myEmpls.add(e); } } NodeList n3 = docEle.getElementsByTagName("d:Description"); if (n3 != null && n3.getLength() > 0) { for (int i = 0; i < n3.getLength(); i++) { //get the employee element Element e3 = (Element) n3.item(i); // System.out.println("here"); System.out.println(e3.getTextContent()); //get the Employee object //Employee e = getEmployee(el); //add it to list //myEmpls.add(e); } } } catch (SAXException se) { se.printStackTrace(); } catch (ParserConfigurationException pe) { pe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } } catch (IOException e) { System.out.println(e); } //The content string is the xml/json output from Bing. }
From source file:MainClass.java
public static void main(String args[]) { DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); builderFactory.setValidating(true); // and validating parser feaures builderFactory.setIgnoringElementContentWhitespace(true); DocumentBuilder builder = null; try {/*from w w w .j a v a 2 s . c o m*/ builder = builderFactory.newDocumentBuilder(); // Create the parser } catch (ParserConfigurationException e) { e.printStackTrace(); } Document xmlDoc = null; try { xmlDoc = builder.parse(new InputSource(new StringReader(xmlString))); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } DocumentType doctype = xmlDoc.getDoctype(); if (doctype == null) { System.out.println("DOCTYPE is null"); } else { System.out.println("DOCTYPE node:\n" + doctype.getInternalSubset()); } System.out.println("\nDocument body contents are:"); listNodes(xmlDoc.getDocumentElement(), ""); // Root element & children }
From source file:TreeDumper2.java
static public void main(String[] arg) { String filename = null;/* w w w . java 2 s. co m*/ boolean validate = false; if (arg.length == 1) { filename = arg[0]; } else if (arg.length == 2) { if (!arg[0].equals("-v")) usage(); validate = true; filename = arg[1]; } else { usage(); } DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(validate); dbf.setNamespaceAware(true); dbf.setIgnoringElementContentWhitespace(true); // Parse the input to produce a parse tree with its root // in the form of a Document object Document doc = null; try { DocumentBuilder builder = dbf.newDocumentBuilder(); builder.setErrorHandler(new MyErrorHandler()); InputSource is = new InputSource(filename); doc = builder.parse(is); } catch (SAXException e) { System.exit(1); } catch (ParserConfigurationException e) { System.err.println(e); System.exit(1); } catch (IOException e) { System.err.println(e); System.exit(1); } // Use a TreeDumper to list the tree TreeDumper2 td = new TreeDumper2(); td.dump(doc); }
From source file:marytts.tools.analysis.CopySynthesis.java
/** * @param args/*from w w w.j av a2s .c o m*/ */ public static void main(String[] args) throws Exception { String wavFilename = null; String labFilename = null; String pitchFilename = null; String textFilename = null; String locale = System.getProperty("locale"); if (locale == null) { throw new IllegalArgumentException("No locale given (-Dlocale=...)"); } for (String arg : args) { if (arg.endsWith(".txt")) textFilename = arg; else if (arg.endsWith(".wav")) wavFilename = arg; else if (arg.endsWith(".ptc")) pitchFilename = arg; else if (arg.endsWith(".lab")) labFilename = arg; else throw new IllegalArgumentException("Don't know how to treat argument: " + arg); } // The intonation contour double[] contour = null; double frameShiftTime = -1; if (pitchFilename == null) { // need to create pitch contour from wav file if (wavFilename == null) { throw new IllegalArgumentException("Need either a pitch file or a wav file"); } AudioInputStream ais = AudioSystem.getAudioInputStream(new File(wavFilename)); AudioDoubleDataSource audio = new AudioDoubleDataSource(ais); PitchFileHeader params = new PitchFileHeader(); params.fs = (int) ais.getFormat().getSampleRate(); F0TrackerAutocorrelationHeuristic tracker = new F0TrackerAutocorrelationHeuristic(params); tracker.pitchAnalyze(audio); frameShiftTime = tracker.getSkipSizeInSeconds(); contour = tracker.getF0Contour(); } else { // have a pitch file -- ignore any wav file PitchReaderWriter f0rw = new PitchReaderWriter(pitchFilename); if (f0rw.contour == null) { throw new NullPointerException("Cannot read f0 contour from " + pitchFilename); } contour = f0rw.contour; frameShiftTime = f0rw.header.skipSizeInSeconds; } assert contour != null; assert frameShiftTime > 0; // The ALLOPHONES data and labels if (labFilename == null) { throw new IllegalArgumentException("No label file given"); } if (textFilename == null) { throw new IllegalArgumentException("No text file given"); } MaryTranscriptionAligner aligner = new 
MaryTranscriptionAligner(); aligner.SetEnsureInitialBoundary(false); String labels = MaryTranscriptionAligner.readLabelFile(aligner.getEntrySeparator(), aligner.getEnsureInitialBoundary(), labFilename); MaryHttpClient mary = new MaryHttpClient(); String text = FileUtils.readFileToString(new File(textFilename), "ASCII"); ByteArrayOutputStream baos = new ByteArrayOutputStream(); mary.process(text, "TEXT", "ALLOPHONES", locale, null, null, baos); ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); docFactory.setNamespaceAware(true); DocumentBuilder builder = docFactory.newDocumentBuilder(); Document doc = builder.parse(bais); aligner.alignXmlTranscriptions(doc, labels); assert doc != null; // durations double[] endTimes = new LabelfileDoubleDataSource(new File(labFilename)).getAllData(); assert endTimes.length == labels.split(Pattern.quote(aligner.getEntrySeparator())).length; // Now add durations and f0 targets to document double prevEnd = 0; NodeIterator ni = MaryDomUtils.createNodeIterator(doc, MaryXML.PHONE, MaryXML.BOUNDARY); for (int i = 0; i < endTimes.length; i++) { Element e = (Element) ni.nextNode(); if (e == null) throw new IllegalStateException("More durations than elements -- this should not happen!"); double durInSeconds = endTimes[i] - prevEnd; int durInMillis = (int) (1000 * durInSeconds); if (e.getTagName().equals(MaryXML.PHONE)) { e.setAttribute("d", String.valueOf(durInMillis)); e.setAttribute("end", new Formatter(Locale.US).format("%.3f", endTimes[i]).toString()); // f0 targets at beginning, mid, and end of phone StringBuilder f0String = new StringBuilder(); double startF0 = getF0(contour, frameShiftTime, prevEnd); if (startF0 != 0 && !Double.isNaN(startF0)) { f0String.append("(0,").append((int) startF0).append(")"); } double midF0 = getF0(contour, frameShiftTime, prevEnd + 0.5 * durInSeconds); if (midF0 != 0 && !Double.isNaN(midF0)) { 
f0String.append("(50,").append((int) midF0).append(")"); } double endF0 = getF0(contour, frameShiftTime, endTimes[i]); if (endF0 != 0 && !Double.isNaN(endF0)) { f0String.append("(100,").append((int) endF0).append(")"); } if (f0String.length() > 0) { e.setAttribute("f0", f0String.toString()); } } else { // boundary e.setAttribute("duration", String.valueOf(durInMillis)); } prevEnd = endTimes[i]; } if (ni.nextNode() != null) { throw new IllegalStateException("More elements than durations -- this should not happen!"); } // TODO: add pitch values String acoustparams = DomUtils.document2String(doc); System.out.println("ACOUSTPARAMS:"); System.out.println(acoustparams); }
From source file:ValidateLicenseHeaders.java
/** * ValidateLicenseHeaders jboss-src-root * /* w ww . j a v a2s. c o m*/ * @param args */ public static void main(String[] args) throws Exception { if (args.length == 0 || args[0].startsWith("-h")) { log.info("Usage: ValidateLicenseHeaders [-addheader] jboss-src-root"); System.exit(1); } int rootArg = 0; if (args.length == 2) { if (args[0].startsWith("-add")) addDefaultHeader = true; else { log.severe("Uknown argument: " + args[0]); log.info("Usage: ValidateLicenseHeaders [-addheader] jboss-src-root"); System.exit(1); } rootArg = 1; } File jbossSrcRoot = new File(args[rootArg]); if (jbossSrcRoot.exists() == false) { log.info("Src root does not exist, check " + jbossSrcRoot.getAbsolutePath()); System.exit(1); } URL u = Thread.currentThread().getContextClassLoader() .getResource("META-INF/services/javax.xml.parsers.DocumentBuilderFactory"); System.err.println(u); // Load the valid copyright statements for the licenses File licenseInfo = new File(jbossSrcRoot, "varia/src/etc/license-info.xml"); if (licenseInfo.exists() == false) { log.severe("Failed to find the varia/src/etc/license-info.xml under the src root"); System.exit(1); } DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder db = factory.newDocumentBuilder(); Document doc = db.parse(licenseInfo); NodeList licenses = doc.getElementsByTagName("license"); for (int i = 0; i < licenses.getLength(); i++) { Element license = (Element) licenses.item(i); String key = license.getAttribute("id"); ArrayList headers = new ArrayList(); licenseHeaders.put(key, headers); NodeList copyrights = license.getElementsByTagName("terms-header"); for (int j = 0; j < copyrights.getLength(); j++) { Element copyright = (Element) copyrights.item(j); copyright.normalize(); String id = copyright.getAttribute("id"); // The id will be blank if there is no id attribute if (id.length() == 0) continue; String text = getElementContent(copyright); if (text == null) continue; // Replace all duplicate whitespace 
and '*' with a single space text = text.replaceAll("[\\s*]+", " "); if (text.length() == 1) continue; text = text.toLowerCase().trim(); // Replace any copyright date0-date1,date2 with copyright ... text = text.replaceAll(COPYRIGHT_REGEX, "..."); LicenseHeader lh = new LicenseHeader(id, text); headers.add(lh); } } log.fine(licenseHeaders.toString()); File[] files = jbossSrcRoot.listFiles(dotJavaFilter); log.info("Root files count: " + files.length); processSourceFiles(files, 0); log.info("Processed " + totalCount); log.info("Updated jboss headers: " + jbossCount); // Files with no headers details log.info("Files with no headers: " + noheaders.size()); FileWriter fw = new FileWriter("NoHeaders.txt"); for (Iterator iter = noheaders.iterator(); iter.hasNext();) { File f = (File) iter.next(); fw.write(f.getAbsolutePath()); fw.write('\n'); } fw.close(); // Files with unknown headers details log.info("Files with invalid headers: " + invalidheaders.size()); fw = new FileWriter("InvalidHeaders.txt"); for (Iterator iter = invalidheaders.iterator(); iter.hasNext();) { File f = (File) iter.next(); fw.write(f.getAbsolutePath()); fw.write('\n'); } fw.close(); // License usage summary log.info("Creating HeadersSummary.txt"); fw = new FileWriter("HeadersSummary.txt"); for (Iterator iter = licenseHeaders.entrySet().iterator(); iter.hasNext();) { Map.Entry entry = (Map.Entry) iter.next(); String key = (String) entry.getKey(); fw.write("+++ License type=" + key); fw.write('\n'); List list = (List) entry.getValue(); Iterator jiter = list.iterator(); while (jiter.hasNext()) { LicenseHeader lh = (LicenseHeader) jiter.next(); fw.write('\t'); fw.write(lh.id); fw.write(", count="); fw.write("" + lh.count); fw.write('\n'); } } fw.close(); }
From source file:AwsConsoleApp.java
public static void main(String[] args) throws Exception { System.out.println("==========================================="); System.out.println("Welcome to the AWS VPN connection creator"); System.out.println("==========================================="); init();/*w ww . j a va 2 s .com*/ List<String> CIDRblocks = new ArrayList<String>(); String vpnType = null; String vpnGatewayId = null; String customerGatewayId = null; String customerGatewayInfoPath = null; String routes = null; options.addOption("h", "help", false, "show help."); options.addOption("vt", "vpntype", true, "Set vpn tunnel type e.g. (ipec.1)"); options.addOption("vgw", "vpnGatewayId", true, "Set AWS VPN Gateway ID e.g. (vgw-eca54d85)"); options.addOption("cgw", "customerGatewayId", true, "Set AWS Customer Gateway ID e.g. (cgw-c16e87a8)"); options.addOption("r", "staticroutes", true, "Set static routes e.g. cutomer subnet 10.77.77.0/24"); options.addOption("vi", "vpninfo", true, "path to vpn info file c:\\temp\\customerGatewayInfo.xml"); CommandLineParser parser = new BasicParser(); CommandLine cmd = null; // Parse command line options try { cmd = parser.parse(options, args); if (cmd.hasOption("h")) help(); if (cmd.hasOption("vt")) { log.log(Level.INFO, "Using cli argument -vt=" + cmd.getOptionValue("vt")); vpnType = cmd.getOptionValue("vt"); // Whatever you want to do with the setting goes here } else { log.log(Level.SEVERE, "Missing vt option"); help(); } if (cmd.hasOption("vgw")) { log.log(Level.INFO, "Using cli argument -vgw=" + cmd.getOptionValue("vgw")); vpnGatewayId = cmd.getOptionValue("vgw"); } else { log.log(Level.SEVERE, "Missing vgw option"); help(); } if (cmd.hasOption("cgw")) { log.log(Level.INFO, "Using cli argument -cgw=" + cmd.getOptionValue("cgw")); customerGatewayId = cmd.getOptionValue("cgw"); } else { log.log(Level.SEVERE, "Missing cgw option"); help(); } if (cmd.hasOption("r")) { log.log(Level.INFO, "Using cli argument -r=" + cmd.getOptionValue("r")); routes = 
cmd.getOptionValue("r"); String[] routeItems = routes.split(","); CIDRblocks = Arrays.asList(routeItems); } else { log.log(Level.SEVERE, "Missing r option"); help(); } if (cmd.hasOption("vi")) { log.log(Level.INFO, "Using cli argument -vi=" + cmd.getOptionValue("vi")); customerGatewayInfoPath = cmd.getOptionValue("vi"); } else { log.log(Level.SEVERE, "Missing vi option"); help(); } } catch (ParseException e) { log.log(Level.SEVERE, "Failed to parse comand line properties", e); help(); } /* * Amazon VPC * Create and delete VPN tunnel to customer VPN hardware */ try { //String vpnType = "ipsec.1"; //String vpnGatewayId = "vgw-eca54d85"; //String customerGatewayId = "cgw-c16e87a8"; //List<String> CIDRblocks = new ArrayList<String>(); //CIDRblocks.add("10.77.77.0/24"); //CIDRblocks.add("172.16.1.0/24"); //CIDRblocks.add("172.18.1.0/24"); //CIDRblocks.add("10.66.66.0/24"); //CIDRblocks.add("10.8.1.0/24"); //String customerGatewayInfoPath = "c:\\temp\\customerGatewayInfo.xml"; Boolean staticRoutesOnly = true; List<String> connectionIds = new ArrayList<String>(); List<String> connectionIdList = new ArrayList<String>(); connectionIdList = vpnExists(connectionIds); if (connectionIdList.size() == 0) { CreateVpnConnectionRequest vpnReq = new CreateVpnConnectionRequest(vpnType, customerGatewayId, vpnGatewayId); CreateVpnConnectionResult vpnRes = new CreateVpnConnectionResult(); VpnConnectionOptionsSpecification vpnspec = new VpnConnectionOptionsSpecification(); vpnspec.setStaticRoutesOnly(staticRoutesOnly); vpnReq.setOptions(vpnspec); System.out.println("Creating VPN connection"); vpnRes = ec2.createVpnConnection(vpnReq); String vpnConnId = vpnRes.getVpnConnection().getVpnConnectionId(); String customerGatewayInfo = vpnRes.getVpnConnection().getCustomerGatewayConfiguration(); //System.out.println("Customer Gateway Info:" + customerGatewayInfo); // Write Customer Gateway Info to file System.out.println("Writing Customer Gateway Info to file:" + customerGatewayInfoPath); try 
(PrintStream out = new PrintStream(new FileOutputStream(customerGatewayInfoPath))) { out.print(customerGatewayInfo); } System.out.println("Creating VPN routes"); for (String destCIDR : CIDRblocks) { CreateVpnConnectionRouteRequest routeReq = new CreateVpnConnectionRouteRequest(); CreateVpnConnectionRouteResult routeRes = new CreateVpnConnectionRouteResult(); routeReq.setDestinationCidrBlock(destCIDR); routeReq.setVpnConnectionId(vpnConnId); routeRes = ec2.createVpnConnectionRoute(routeReq); } // Parse XML file File file = new File(customerGatewayInfoPath); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document document = db.parse(customerGatewayInfoPath); XPathFactory xPathfactory = XPathFactory.newInstance(); XPath xpath = xPathfactory.newXPath(); XPathExpression exprGetipAddress = xpath .compile("/vpn_connection/ipsec_tunnel/vpn_gateway/tunnel_outside_address/ip_address"); NodeList vpnGateway = (NodeList) exprGetipAddress.evaluate(document, XPathConstants.NODESET); if (vpnGateway != null) { for (int i = 0; i < vpnGateway.getLength(); i++) { String vpnGatewayIP = vpnGateway.item(i).getTextContent(); System.out .println("AWS vpnGatewayIP for tunnel " + Integer.toString(i) + " " + vpnGatewayIP); } } System.out.println("=============================================="); XPathExpression exprGetKey = xpath.compile("/vpn_connection/ipsec_tunnel/ike/pre_shared_key"); NodeList presharedKeyList = (NodeList) exprGetKey.evaluate(document, XPathConstants.NODESET); if (presharedKeyList != null) { for (int i = 0; i < presharedKeyList.getLength(); i++) { String pre_shared_key = presharedKeyList.item(i).getTextContent(); System.out.println( "AWS pre_shared_key for tunnel " + Integer.toString(i) + " " + pre_shared_key); } } System.out.println("Creating VPN creation completed!"); } else { boolean yn; Scanner scan = new Scanner(System.in); System.out.println("Enter yes or no to delete VPN connection: "); String input 
= scan.next(); String answer = input.trim().toLowerCase(); while (true) { if (answer.equals("yes")) { yn = true; break; } else if (answer.equals("no")) { yn = false; System.exit(0); } else { System.out.println("Sorry, I didn't catch that. Please answer yes/no"); } } // Delete all existing VPN connections System.out.println("Deleting AWS VPN connection(s)"); for (String vpnConID : connectionIdList) { DeleteVpnConnectionResult delVPNres = new DeleteVpnConnectionResult(); DeleteVpnConnectionRequest delVPNreq = new DeleteVpnConnectionRequest(); delVPNreq.setVpnConnectionId(vpnConID); delVPNres = ec2.deleteVpnConnection(delVPNreq); System.out.println("Successfully deleted AWS VPN conntion: " + vpnConID); } } } catch (AmazonServiceException ase) { System.out.println("Caught Exception: " + ase.getMessage()); System.out.println("Reponse Status Code: " + ase.getStatusCode()); System.out.println("Error Code: " + ase.getErrorCode()); System.out.println("Request ID: " + ase.getRequestId()); } }
From source file:com.crawler.app.run.CrawlSiteController.java
public static void main(String[] args) throws Exception { logger.info("Start...: "); /*/*from w w w. j ava 2 s. c o m*/ * if (args.length != 2) { logger.info("Needed parameters: "); * logger.info * ("\t rootFolder (it will contain intermediate crawl data)"); * logger.info("\t numberOfCralwers (number of concurrent threads)"); * return; } */ /* * crawlStorageFolder is a folder where intermediate crawl data is * stored. */ String crawlStorageFolder = "D:\\/Java\\/storage";//"/crawler4j/storage";// args[0]; /* * numberOfCrawlers shows the number of concurrent threads that should * be initiated for crawling. */ // int numberOfCrawlers = Integer.parseInt(args[1]); int numberOfCrawlers = 1; CrawlConfig config = new CrawlConfig(); config.setCrawlStorageFolder(crawlStorageFolder); /* * Be polite: Make sure that we don't send more than 1 request per * second (1000 milliseconds between requests). */ config.setPolitenessDelay(1000); // config.setFollowRedirects(false); /* * You can set the maximum crawl depth here. The default value is -1 for * unlimited depth */ config.setMaxDepthOfCrawling(1);// ( use -1 for unlimited depth ) /* * You can set the maximum number of pages to crawl. The default value * is -1 for unlimited number of pages */ config.setMaxPagesToFetch(-1);// ( use -1 for unlimited pages ) /** * Do you want crawler4j to crawl also binary data ? example: the * contents of pdf, or the metadata of images etc */ config.setIncludeBinaryContentInCrawling(false); /* * Do you need to set a proxy? If so, you can use: * config.setProxyHost("proxyserver.example.com"); * config.setProxyPort(8080); * * If your proxy also needs authentication: * config.setProxyUsername(username); config.getProxyPassword(password); */ /* * This config parameter can be used to set your crawl to be resumable * (meaning that you can resume the crawl from a previously * interrupted/crashed crawl). 
Note: if you enable resuming feature and * want to start a fresh crawl, you need to delete the contents of * rootFolder manually. */ config.setResumableCrawling(false); /* * Overwrite user ddagent */ config.setUserAgentString("Crawler"); /* * Instantiate the controller for this crawl. */ PageFetcher pageFetcher = new PageFetcher(config); RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); // by me robotstxtConfig.setEnabled(false); RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); // by me CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); /* * For each crawl, you need to add some seed urls. These are the first * URLs that are fetched and then the crawler starts following links * which are found in these pages */ // controller.addSeed("http://careerbuilder.vn/"); try { String tag_size = "site102"; int sizeIDXML = -1; String provinceYESNO, linkCrawlerBegin, linkCrawlerPage; int pageNumberBegin = -1, pageNumberEnd = -1, pageLoopInit = -1, pageLoop = -1; File fXmlFile = new File(pathXmlFile); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); org.w3c.dom.Document doc = dBuilder.parse(fXmlFile); org.w3c.dom.NodeList nList = doc.getElementsByTagName(tag_size); org.w3c.dom.Node nNode = nList.item(0); if (nNode.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) { org.w3c.dom.Element eElement = (org.w3c.dom.Element) nNode; sizeIDXML = Integer.parseInt(eElement.getAttribute("id")); String pageDefine = eElement.getElementsByTagName("pageDefine").item(0).getTextContent(); // read config ReadConfigPageNumberEnd(eElement); // if define then get define value to using if (!pageDefine.isEmpty() && pageDefine.toUpperCase().equals("YES")) { org.w3c.dom.NodeList nListOnewebsite = eElement.getElementsByTagName("website"); org.w3c.dom.Element eElementOnewebsite = (org.w3c.dom.Element) nListOnewebsite.item(0); linkCrawlerBegin = 
eElementOnewebsite.getElementsByTagName("linkCrawlerBegin").item(0) .getTextContent(); linkCrawlerPage = eElementOnewebsite.getElementsByTagName("linkCrawlerPage").item(0) .getTextContent(); int pageNumberTotal = Integer.parseInt( eElementOnewebsite.getElementsByTagName("pageNumberTotal").item(0).getTextContent()); pageNumberBegin = Integer.parseInt( eElementOnewebsite.getElementsByTagName("pageNumberBegin").item(0).getTextContent()); pageLoopInit = Integer.parseInt( eElementOnewebsite.getElementsByTagName("pageLoopInit").item(0).getTextContent()); pageLoop = Integer .parseInt(eElementOnewebsite.getElementsByTagName("pageLoop").item(0).getTextContent()); if (!linkCrawlerBegin.isEmpty()) { controller.addSeed(linkCrawlerBegin); //pageNumberEnd = getPageNumberEnd(linkCrawlerBegin); if (pageNumberTotal > 1) { int i = 0; for (; pageNumberTotal >= pageNumberBegin; pageNumberBegin++) { String convertlinkCrawlerPage = linkCrawlerPage.replace("%s", String.valueOf(pageLoopInit)); controller.addSeed(convertlinkCrawlerPage); pageLoopInit += pageLoop; i++; System.out.println(i); } } } } else { provinceYESNO = eElement.getElementsByTagName("provinceYESNO").item(0).getTextContent(); if (!provinceYESNO.isEmpty() && provinceYESNO.toUpperCase().equals("YES")) { // have sevent province org.w3c.dom.NodeList nListProvince = eElement.getElementsByTagName("province"); for (int index = 0; index < nListProvince.getLength(); index++) { org.w3c.dom.Element eElementProvince = (org.w3c.dom.Element) nListProvince.item(index); linkCrawlerBegin = eElementProvince.getElementsByTagName("linkCrawlerBegin").item(0) .getTextContent(); linkCrawlerPage = eElementProvince.getElementsByTagName("linkCrawlerPage").item(0) .getTextContent(); if (!eElementProvince.getElementsByTagName("pageNumberBegin").item(0).getTextContent() .isEmpty()) { pageNumberBegin = Integer.parseInt(eElementProvince .getElementsByTagName("pageNumberBegin").item(0).getTextContent()); } if 
(!eElementProvince.getElementsByTagName("pageLoopInit").item(0).getTextContent() .isEmpty()) { pageLoopInit = Integer.parseInt(eElementProvince .getElementsByTagName("pageLoopInit").item(0).getTextContent()); } if (!eElementProvince.getElementsByTagName("pageLoop").item(0).getTextContent() .isEmpty()) { pageLoop = Integer.parseInt( eElementProvince.getElementsByTagName("pageLoop").item(0).getTextContent()); } if (!linkCrawlerBegin.isEmpty()) { controller.addSeed(linkCrawlerBegin); pageNumberEnd = getPageNumberEnd(linkCrawlerBegin); if (pageNumberEnd > 1) { for (; pageNumberBegin <= pageNumberEnd; pageNumberBegin++) { String convertlinkCrawlerPage = linkCrawlerPage.replace("%s", String.valueOf(pageLoopInit)); controller.addSeed(convertlinkCrawlerPage); pageLoopInit += pageLoop; } } } } } else if (!provinceYESNO.isEmpty() && provinceYESNO.toUpperCase().equals("NO")) { // don't have sevent province org.w3c.dom.NodeList nListOnewebsite = eElement.getElementsByTagName("website"); org.w3c.dom.Element eElementOnewebsite = (org.w3c.dom.Element) nListOnewebsite.item(0); // read config of pagenumber end linkCrawlerBegin = eElementOnewebsite.getElementsByTagName("linkCrawlerBegin").item(0) .getTextContent(); linkCrawlerPage = eElementOnewebsite.getElementsByTagName("linkCrawlerPage").item(0) .getTextContent(); if (!eElementOnewebsite.getElementsByTagName("pageNumberBegin").item(0).getTextContent() .isEmpty()) { pageNumberBegin = Integer.parseInt(eElementOnewebsite .getElementsByTagName("pageNumberBegin").item(0).getTextContent()); } if (!eElementOnewebsite.getElementsByTagName("pageLoopInit").item(0).getTextContent() .isEmpty()) { pageLoopInit = Integer.parseInt(eElementOnewebsite.getElementsByTagName("pageLoopInit") .item(0).getTextContent()); } if (!eElementOnewebsite.getElementsByTagName("pageLoop").item(0).getTextContent() .isEmpty()) { pageLoop = Integer.parseInt( eElementOnewebsite.getElementsByTagName("pageLoop").item(0).getTextContent()); } if 
(!linkCrawlerBegin.isEmpty()) { controller.addSeed(linkCrawlerBegin); pageNumberEnd = getPageNumberEnd(linkCrawlerBegin); if (pageNumberEnd > 1) { for (; pageNumberBegin <= pageNumberEnd; pageNumberBegin++) { String convertlinkCrawlerPage = linkCrawlerPage.replace("%s", String.valueOf(pageLoopInit)); controller.addSeed(convertlinkCrawlerPage); pageLoopInit += pageLoop; } } } } } CrawlSite.tag_size = tag_size; CrawlSite.siteIDXML = sizeIDXML; controller.start(CrawlSite.class, numberOfCrawlers); } } catch (Exception ex) { System.out.print("can't read config xml, review xml file !!"); System.out.print(ex.getMessage()); } }
From source file:com.occamlab.te.parsers.ImageParser.java
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Parameters: xml_url image_url"); return;//from ww w . jav a 2s. co m } java.net.URL xml_url; try { xml_url = new java.net.URL(args[0]); } catch (Exception e) { jlogger.log(Level.INFO, "Error building xmlurl, will prefix file://", e); xml_url = new java.net.URL("file://" + args[0]); } java.net.URL image_url; try { image_url = new java.net.URL(args[1]); } catch (Exception e) { jlogger.log(Level.INFO, "Error building xmlurl, will prefix file://", e); image_url = new java.net.URL("file://" + args[1]); } DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setNamespaceAware(true); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(xml_url.openStream()); // Element instruction = (Element) // doc.getElementsByTagNameNS("http://www.occamlab.com/te/parsers", // "ImageParser").item(0); Element instruction = (Element) doc.getDocumentElement(); PrintWriter logger = new PrintWriter(System.out); InputStream image_is = image_url.openConnection().getInputStream(); Document result = parse(image_is, instruction, logger); logger.flush(); if (result != null) { TransformerFactory tf = TransformerFactory.newInstance(); try { tf.setAttribute("http://saxon.sf.net/feature/strip-whitespace", "all"); } catch (IllegalArgumentException e) { jlogger.log(Level.INFO, "setAttribute(\"http://saxon.sf.net/feature/strip-whitespace\", \"all\");", e); } Transformer t = tf.newTransformer(); t.setOutputProperty(OutputKeys.INDENT, "yes"); t.transform(new DOMSource(result), new StreamResult(System.out)); } System.exit(0); }