<?xml version="1.0"?>
<!-- Application configuration for WebCrawler: declares the per-user settings
     section and supplies the default value of each setting. -->
<configuration>
  <configSections>
    <sectionGroup name="userSettings" type="System.Configuration.UserSettingsGroup, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" >
      <section name="WebCrawler.Properties.Settings" type="System.Configuration.ClientSettingsSection, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" allowExeDefinition="MachineToLocalUser" requirePermission="false" />
    </sectionGroup>
  </configSections>
  <startup>
    <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.0"/>
  </startup>
  <userSettings>
    <WebCrawler.Properties.Settings>
      <!-- Default file that error entries are appended to. -->
      <setting name="LogPath" serializeAs="String">
        <value>C:\temp\WebCrawler\ErrorLog.txt</value>
      </setting>
      <!-- Default file that the discovered links are saved to as XML. -->
      <setting name="XMLPath" serializeAs="String">
        <value>C:\temp\WebCrawler\WebCrawlerLog.xml</value>
      </setting>
      <!-- Last URL entered in the form; empty until the first crawl. -->
      <setting name="URLPath" serializeAs="String">
        <value />
      </setting>
      <!-- Text stripped from cleaned titles/URLs; empty means nothing is stripped. -->
      <setting name="TrimTitle" serializeAs="String">
        <value />
      </setting>
    </WebCrawler.Properties.Settings>
  </userSettings>
</configuration>
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Diagnostics;
using System.Net;
using System.Web;
namespace WebCrawler
{
/// <summary>
/// Main window of the crawler. Lets the user pick the log/XML output files,
/// enter a URL, download that page, and list the links found in it.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Text to strip from cleaned strings; read by LinkFinder.XMLCleanString.</summary>
    public static string sTrimTitle = "";

    /// <summary>URL text of the most recent crawl; read by LinkFinder.Find to name the XML root element.</summary>
    public static string sSourceURL = "";

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Restores the saved settings into the text boxes and makes sure the
    /// folders of the configured output files exist.
    /// </summary>
    private void Form1_Load(object sender, EventArgs e)
    {
        if (!string.IsNullOrEmpty(Properties.Settings.Default.XMLPath))
        {
            txtXMLPath.Text = Properties.Settings.Default.XMLPath;
            EnsureParentDirectory(txtXMLPath.Text);
        }

        if (!string.IsNullOrEmpty(Properties.Settings.Default.LogPath))
        {
            txtLogPath.Text = Properties.Settings.Default.LogPath;
            EnsureParentDirectory(txtLogPath.Text);
        }

        if (!string.IsNullOrEmpty(Properties.Settings.Default.URLPath))
        {
            txtURL.Text = Properties.Settings.Default.URLPath;
        }

        if (!string.IsNullOrEmpty(Properties.Settings.Default.TrimTitle))
        {
            txtTrimTitle.Text = Properties.Settings.Default.TrimTitle;
            sTrimTitle = Properties.Settings.Default.TrimTitle;
        }
    }

    /// <summary>Lets the user choose the folder for the error log and saves the resulting file path.</summary>
    private void btnLogPathBrowse_Click(object sender, EventArgs e)
    {
        if (folderBrowserDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        // Path.Combine inserts the directory separator that plain concatenation omitted.
        txtLogPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, "ErrorLog.txt");
        Properties.Settings.Default.LogPath = txtLogPath.Text;
        Properties.Settings.Default.Save();
    }

    /// <summary>Lets the user choose the folder for the XML output and saves the resulting file path.</summary>
    private void btnXMLPathBrowse_Click(object sender, EventArgs e)
    {
        if (folderBrowserDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        txtXMLPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, "WebCrawlerLog.xml");
        Properties.Settings.Default.XMLPath = txtXMLPath.Text;
        Properties.Settings.Default.Save();
    }

    /// <summary>
    /// Saves the current inputs, downloads the page at the entered URL and
    /// fills the result list with one row per link returned by LinkFinder.Find.
    /// </summary>
    private void btnRunCrawler_Click(object sender, EventArgs e)
    {
        // Clear() removes both rows and columns, so the columns are re-added below.
        lvResults.Clear();

        Properties.Settings.Default.URLPath = txtURL.Text;
        Properties.Settings.Default.TrimTitle = txtTrimTitle.Text;
        Properties.Settings.Default.Save();
        sSourceURL = txtURL.Text;
        sTrimTitle = txtTrimTitle.Text;

        string url = NormalizeUrl(txtURL.Text);
        if (url == null)
        {
            MessageBox.Show("Please enter a URL to crawl.", "WebCrawler",
                MessageBoxButtons.OK, MessageBoxIcon.Information);
            return;
        }

        string html;
        try
        {
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(url);
            }
        }
        catch (WebException ex)
        {
            MessageBox.Show("Could not download " + url + Environment.NewLine + ex.Message,
                "WebCrawler", MessageBoxButtons.OK, MessageBoxIcon.Error);
            return;
        }

        // NOTE(review): the crawl entry is appended to the XML output path, which
        // LinkFinder.Find overwrites when it saves the link list. Kept as in the
        // original; confirm whether the log path was intended here.
        using (StreamWriter writer = File.AppendText(txtXMLPath.Text))
        {
            DirLog.Log(url, writer);
        }

        // Find also writes the XML file, so it is called exactly once.
        List<LinkItem> links = LinkFinder.Find(html);

        lvResults.View = View.Details;
        lvResults.Columns.Add("URL", -2, HorizontalAlignment.Left);
        lvResults.Columns.Add("Type", -2, HorizontalAlignment.Left);
        lvResults.Columns.Add("Extension", -2, HorizontalAlignment.Left);
        lvResults.Columns.Add("Title", -2, HorizontalAlignment.Left);

        foreach (LinkItem link in links)
        {
            // Read the fields directly instead of splitting LinkItem.ToString(),
            // so a separator character inside a value cannot shift the columns.
            ListViewItem row = new ListViewItem(TrimOrEmpty(link.Href));
            row.SubItems.Add(TrimOrEmpty(link.Type));
            row.SubItems.Add(TrimOrEmpty(link.Extension));
            row.SubItems.Add(TrimOrEmpty(link.Text));
            lvResults.Items.Add(row);
        }

        lvResults.AutoResizeColumn(0, ColumnHeaderAutoResizeStyle.HeaderSize);
        lvResults.AutoResizeColumn(1, ColumnHeaderAutoResizeStyle.ColumnContent);
        lvResults.AutoResizeColumn(2, ColumnHeaderAutoResizeStyle.ColumnContent);
        lvResults.AutoResizeColumn(3, ColumnHeaderAutoResizeStyle.ColumnContent);
        lblRowCount.Text = "Row Count: " + lvResults.Items.Count.ToString();
    }

    // Creates the folder that contains filePath, if the path names one.
    private static void EnsureParentDirectory(string filePath)
    {
        // GetDirectoryName yields null for a root path and "" for a bare file
        // name; CreateDirectory rejects both, so skip them.
        string directory = Path.GetDirectoryName(filePath);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }
    }

    // Returns the entered URL with "http://" prepended when it has no http(s) scheme, or null when it is blank.
    private static string NormalizeUrl(string input)
    {
        string trimmed = (input ?? "").Trim();
        if (trimmed.Length == 0)
        {
            return null;
        }

        bool hasScheme = trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        return hasScheme ? trimmed : "http://" + trimmed;
    }

    // Returns value without surrounding whitespace, or "" when value is null.
    private static string TrimOrEmpty(string value)
    {
        return value == null ? "" : value.Trim();
    }
}
/// <summary>
/// One link found in a page: its target, visible text, kind and file extension.
/// </summary>
public struct LinkItem
{
    // Separator written between fields by ToString: newline, carriage return, tab.
    private const string FieldSeparator = "\n\r\t";

    /// <summary>Value of the link's href attribute.</summary>
    public string Href;

    /// <summary>Link text with markup removed.</summary>
    public string Text;

    /// <summary>"Image" or "Text", as assigned by LinkFinder.Find.</summary>
    public string Type;

    /// <summary>Extension assigned by LinkFinder.Find ("/" when the link has none).</summary>
    public string Extension;

    /// <summary>
    /// Returns Href, Text, Type and Extension in that order, separated by
    /// newline + carriage return + tab, so the result can be split on '\t'
    /// and each part trimmed.
    /// </summary>
    public override string ToString()
    {
        return Href + FieldSeparator + Text + FieldSeparator + Type + FieldSeparator + Extension;
    }
}
/// <summary>
/// Extracts the anchor links from an HTML string and saves them as XML to the
/// configured XMLPath setting.
/// </summary>
static class LinkFinder
{
    // Tabs, line breaks and markup tags (with the whitespace around them).
    private const string TagAndControlPattern = @"\t|\n|\r|\s*<.*?>\s*";

    // An <img> tag whose src attribute is quoted with either quote character.
    private const string ImagePattern = @"<img.+?src=[""'](.+?)[""'].+?>";

    /// <summary>
    /// Finds every &lt;a ...&gt;...&lt;/a&gt; element in <paramref name="file"/> that has a
    /// non-empty double-quoted href, then writes the resulting list to the XML
    /// file named by the XMLPath setting.
    /// </summary>
    /// <param name="file">HTML text to scan; null is treated as empty.</param>
    /// <returns>The links found, in document order. Never null.</returns>
    /// <remarks>
    /// Failures while parsing one link or while saving the XML are written to
    /// the error log instead of being thrown.
    /// </remarks>
    public static List<LinkItem> Find(string file)
    {
        file = file ?? "";
        List<LinkItem> linkList = new List<LinkItem>();

        // Shortened page title; only used to give context in the error log.
        Match titleMatch = Regex.Match(file, @"<title>(.*?)</title>", RegexOptions.Singleline);
        string pageTitle = XMLCleanString(titleMatch.Value, 25);

        foreach (Match anchor in Regex.Matches(file, @"(<a.*?>.*?</a>)", RegexOptions.Singleline))
        {
            try
            {
                LinkItem item;
                if (TryParseAnchor(anchor.Groups[1].Value, out item))
                {
                    linkList.Add(item);
                }
            }
            catch (Exception ex)
            {
                // One unparsable link must not abort the crawl, but record why it was dropped.
                LogError(ex.ToString() + Environment.NewLine + "link: " + anchor.Value);
            }
        }

        try
        {
            // The cleaned URL becomes the root element name. EncodeLocalName escapes
            // characters such as '?', '=' or ':' that are not legal in an XML name,
            // which would otherwise make the save fail.
            string rootName = System.Xml.XmlConvert.EncodeLocalName(XMLCleanString(Form1.sSourceURL, 250));
            XElement root = new XElement(rootName,
                from ll in linkList
                select new XElement("LinkInfo",
                    new XElement("URL", ll.Href),
                    new XElement("Type", ll.Type),
                    new XElement("Extension", ll.Extension),
                    new XElement("Text", ll.Text)));
            root.Save(Properties.Settings.Default.XMLPath);
        }
        catch (Exception ex)
        {
            LogError(ex.ToString() + Environment.NewLine + "pageTitle: " + pageTitle);
        }

        return linkList;
    }

    /// <summary>
    /// Reduces <paramref name="source"/> to a compact token: strips markup, tabs and
    /// line breaks, removes "http://" / "https://", turns "/" and whitespace runs
    /// into "_", removes Form1.sTrimTitle and apostrophes, then shortens the result.
    /// </summary>
    /// <param name="source">Text to clean; null yields an empty string.</param>
    /// <param name="nLength">
    /// Length limit. A result longer than this is cut to nLength - 1 characters
    /// (the behavior of the original truncate-then-drop-last-character code).
    /// </param>
    /// <returns>The cleaned, possibly shortened string.</returns>
    public static string XMLCleanString(string source, int nLength)
    {
        if (source == null)
        {
            return "";
        }

        string cleaned = Regex.Replace(source.Trim(), TagAndControlPattern, "", RegexOptions.Singleline).Trim();
        cleaned = cleaned.Replace("http://", "").Replace("https://", "").Replace("/", "_");

        // String.Replace throws ArgumentException for an empty search string, and
        // the trim title is empty unless the user has entered one.
        if (!string.IsNullOrEmpty(Form1.sTrimTitle))
        {
            cleaned = cleaned.Replace(Form1.sTrimTitle, "");
        }

        cleaned = Regex.Replace(cleaned, @"\s+", "_");
        cleaned = cleaned.Replace("'", "");

        if (cleaned.Length > nLength)
        {
            cleaned = cleaned.Substring(0, Math.Max(0, nLength - 1));
        }

        return cleaned;
    }

    // Builds a LinkItem from one anchor element; returns false when it has no usable href.
    private static bool TryParseAnchor(string anchorHtml, out LinkItem item)
    {
        item = new LinkItem();

        Match hrefMatch = Regex.Match(anchorHtml, @"href=""(.*?)""", RegexOptions.Singleline);
        if (!hrefMatch.Success || hrefMatch.Groups[1].Value.Length == 0)
        {
            // Previously such anchors were dropped via a swallowed exception.
            return false;
        }

        item.Href = hrefMatch.Groups[1].Value;
        item.Type = Regex.IsMatch(anchorHtml, ImagePattern, RegexOptions.IgnoreCase) ? "Image" : "Text";
        item.Text = Regex.Replace(anchorHtml, TagAndControlPattern, "", RegexOptions.Singleline).Trim();
        item.Extension = ExtensionOf(item.Href);
        return true;
    }

    // Returns the extension of href without any query string, or "/" when there is none.
    private static string ExtensionOf(string href)
    {
        string extension = VirtualPathUtility.GetExtension(href);
        if (string.IsNullOrEmpty(extension))
        {
            return "/";
        }

        int queryStart = extension.IndexOf('?');
        return queryStart >= 0 ? extension.Substring(0, queryStart) : extension;
    }

    // Appends message to the configured error log; falls back to debug output when the log cannot be written.
    private static void LogError(string message)
    {
        string errorPath = Properties.Settings.Default.LogPath;
        if (string.IsNullOrEmpty(errorPath))
        {
            Debug.WriteLine(message);
            return;
        }

        try
        {
            using (StreamWriter writer = File.AppendText(errorPath))
            {
                DirLog.Log(message, writer);
            }
        }
        catch (IOException)
        {
            Debug.WriteLine(message);
        }
        catch (UnauthorizedAccessException)
        {
            Debug.WriteLine(message);
        }
    }
}
/// <summary>
/// Helpers for writing plain-text log entries and echoing a log to the console.
/// </summary>
class DirLog
{
    /// <summary>
    /// Writes one log entry to <paramref name="w"/>: a blank line, a
    /// "Log Entry : " header with the current local time and date, the message,
    /// and a separator line.
    /// </summary>
    /// <param name="logMessage">Text of the entry.</param>
    /// <param name="w">Writer that receives the entry; the caller owns and disposes it.</param>
    public static void Log(string logMessage, TextWriter w)
    {
        // Read the clock once so the time and date belong to the same instant.
        DateTime now = DateTime.Now;

        w.Write("\r\nLog Entry : ");
        // Time followed by date (the header previously printed the time twice).
        w.WriteLine("{0} {1}", now.ToLongTimeString(), now.ToLongDateString());
        w.WriteLine(" :");
        w.WriteLine(" :{0}", logMessage);
        w.WriteLine("--------------------------------------------------------------------------------");
    }

    /// <summary>
    /// Copies every remaining line of <paramref name="r"/> to the console.
    /// </summary>
    /// <param name="r">Reader positioned where the dump should start.</param>
    public static void DumpLog(StreamReader r)
    {
        string line;
        while ((line = r.ReadLine()) != null)
        {
            Console.WriteLine(line);
        }
    }
}
}
Originally Posted on August 12, 2013
Last Updated on October 26, 2015
All information on this site is shared with the intention to help. Before any source code or program is run on a production (non-development) system, it is suggested that you test it and fully understand what it is doing, not just what it appears to be doing. I accept no responsibility for any damage you may do with this code.

You must be logged in to post a comment.