// Downloads a web page from the Internet and returns a string.
using System;
using System.IO;
using System.Net.Mail;
using System.Collections.Generic;
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
using System.Configuration;
using System.Globalization;
using System.Web;
using System.Web.Configuration;
using System.Threading;
using System.Reflection;
using System.Collections;
using System.Xml;
using System.Net;
using System.Web.Caching;
namespace BlogEngine.Core
{
/// <summary>
/// Utilities for the entire solution to use.
/// </summary>
public static class Utils
{
    /// <summary>
    /// Downloads a web page from the Internet and returns its content as a string.
    /// </summary>
    /// <param name="url">The URL to download from.</param>
    /// <returns>
    /// The text of the response, or null when the request fails with a
    /// <see cref="WebException"/>. Other exceptions are not handled here.
    /// </returns>
    public static string DownloadWebPage(Uri url)
    {
        try
        {
            using (WebClient client = new WebClient())
            {
                // NOTE(review): the default credentials are offered to whichever host is
                // requested; confirm this is intended for arbitrary Internet URLs.
                client.UseDefaultCredentials = true;
                client.Headers.Add(System.Net.HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)");

                using (StreamReader reader = new StreamReader(client.OpenRead(url)))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (WebException)
        {
            return null;
        }
    }

    /// <summary>
    /// Loads the XML document found at <paramref name="xmlUrl"/>, resolving it against
    /// <paramref name="url"/> when it is relative.
    /// </summary>
    /// <param name="url">
    /// The base address used for a relative <paramref name="xmlUrl"/>
    /// (presumably the page the link was found on; verify against the caller).
    /// </param>
    /// <param name="xmlUrl">The address of the XML document, absolute or relative.</param>
    /// <returns>
    /// The loaded document, or null when no loadable address can be built or loading
    /// throws (any exception, including one caused by a null argument, yields null).
    /// </returns>
    private static XmlDocument LoadDocument(Uri url, Uri xmlUrl)
    {
        try
        {
            string address;

            if (xmlUrl.IsAbsoluteUri)
            {
                // An absolute document address needs no base.
                address = xmlUrl.ToString();
            }
            else if (url.IsAbsoluteUri)
            {
                // Resolve the relative document address against the absolute base.
                address = new Uri(url, xmlUrl).ToString();
            }
            else if (!url.ToString().StartsWith("/", StringComparison.Ordinal))
            {
                // Both are relative: keep the plain concatenation of base and document address.
                address = url + xmlUrl.ToString();
            }
            else
            {
                // A root-relative base carries no scheme or authority (Uri.Scheme and
                // Uri.Authority throw on a relative Uri), so no absolute address can be built.
                return null;
            }

            // NOTE(review): the address may point at remote, untrusted XML; consider
            // restricting DTD / external entity resolution before loading.
            XmlDocument doc = new XmlDocument();
            doc.Load(address);
            return doc;
        }
        catch (Exception)
        {
            // Best effort: a document that cannot be loaded is reported as null.
            return null;
        }
    }

    // Format string for the regex that locates, inside <head>...</head>, a <link> tag whose
    // title attribute equals {0}. Group 1 captures the attribute text of that tag.
    // The pattern is greedy on both sides, so it yields at most one match per input
    // (the last qualifying <link> before the last </head>).
    private const string PATTERN = "<head.*<link( [^>]*title=\"{0}\"[^>]*)>.*</head>";

    // Extracts the value of a double-quoted href attribute. The value stops at the first
    // closing quote so that attributes following href are not swallowed into the URL.
    private static readonly Regex HREF = new Regex("href=\"([^\"]*)\"", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Finds semantic links in a given HTML document.
    /// </summary>
    /// <param name="type">
    /// The type of link, matched literally (case-insensitively) against the title
    /// attribute of the link tag. Could be foaf, apml or sioc.
    /// </param>
    /// <param name="html">The HTML to look through. May be null or empty.</param>
    /// <returns>
    /// The absolute URLs found in the href attribute of the matching link tag.
    /// The list is empty when nothing matches, when the href is not an absolute URL,
    /// or when <paramref name="html"/> is null or empty. Never null.
    /// </returns>
    public static List<Uri> FindLinks(string type, string html)
    {
        List<Uri> urls = new List<Uri>();

        // DownloadWebPage reports failure as null, so tolerate that input here.
        if (string.IsNullOrEmpty(html))
        {
            return urls;
        }

        // Escape the type so characters such as '.' or '*' are matched literally.
        string pattern = string.Format(PATTERN, Regex.Escape(type ?? string.Empty));
        MatchCollection matches = Regex.Matches(html, pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);

        foreach (Match match in matches)
        {
            Match hrefMatch = HREF.Match(match.Groups[1].Value);
            if (!hrefMatch.Success)
            {
                // The link tag has no double-quoted href attribute.
                continue;
            }

            Uri url;
            if (Uri.TryCreate(hrefMatch.Groups[1].Value, UriKind.Absolute, out url))
            {
                urls.Add(url);
            }
        }

        return urls;
    }
}
}
// Related examples in the same category