Skip to content
David Kittell
David Kittell

Application & System: Development / Integration / Orchestration

  • Services
    • Application Development
    • Online Application Integration
  • Code
  • Online Tools
  • Tech Support
David Kittell

Application & System: Development / Integration / Orchestration

Web Crawler – C#

Posted on August 12, 2013 (updated October 26, 2015) by David Kittell

WebCrawlerApplication

<?xml version="1.0"?>
<!--
	Application configuration for the WebCrawler project.
	Declares the per-user settings section and the default values of the four
	settings the form reads on load: LogPath, XMLPath, URLPath and TrimTitle.
-->
<configuration>
	<configSections>
		<sectionGroup name="userSettings" type="System.Configuration.UserSettingsGroup, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" >
			<section name="WebCrawler.Properties.Settings" type="System.Configuration.ClientSettingsSection, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" allowExeDefinition="MachineToLocalUser" requirePermission="false" />
		</sectionGroup>
	</configSections>
	<startup>
		<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.0"/>
	</startup>
	<userSettings>
		<WebCrawler.Properties.Settings>
			<setting name="LogPath" serializeAs="String">
				<value>C:\temp\WebCrawler\ErrorLog.txt</value>
			</setting>
			<setting name="XMLPath" serializeAs="String">
				<value>C:\temp\WebCrawler\WebCrawlerLog.xml</value>
			</setting>
			<setting name="URLPath" serializeAs="String">
				<value />
			</setting>
			<setting name="TrimTitle" serializeAs="String">
				<value />
			</setting>
		</WebCrawler.Properties.Settings>
	</userSettings>
</configuration>
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Diagnostics;
using System.Net;
using System.Web;

namespace WebCrawler
{
	/// <summary>
	/// Main window of the crawler. Lets the user choose the log/XML output files,
	/// enter a URL, download that page and list every link that
	/// <c>LinkFinder.Find</c> extracts from it.
	/// </summary>
	public partial class Form1 : Form
	{

		/// <summary>Text that <c>LinkFinder.XMLCleanString</c> removes from the strings it cleans.</summary>
		public static string sTrimTitle = "";

		/// <summary>URL text of the current crawl; <c>LinkFinder.Find</c> derives the XML root element name from it.</summary>
		public static string sSourceURL = "";

		public Form1()
		{
			InitializeComponent();
		}

		/// <summary>
		/// Restores the saved user settings into the text boxes and makes sure the
		/// folders of the configured output files exist.
		/// </summary>
		private void Form1_Load(object sender, EventArgs e)
		{
			if (!string.IsNullOrEmpty(Properties.Settings.Default.XMLPath))
			{
				txtXMLPath.Text = Properties.Settings.Default.XMLPath;
				EnsureParentDirectory(txtXMLPath.Text);
			}
			if (!string.IsNullOrEmpty(Properties.Settings.Default.LogPath))
			{
				txtLogPath.Text = Properties.Settings.Default.LogPath;
				EnsureParentDirectory(txtLogPath.Text);
			}
			if (!string.IsNullOrEmpty(Properties.Settings.Default.URLPath))
			{
				txtURL.Text = Properties.Settings.Default.URLPath;
			}
			if (!string.IsNullOrEmpty(Properties.Settings.Default.TrimTitle))
			{
				txtTrimTitle.Text = Properties.Settings.Default.TrimTitle;
				sTrimTitle = Properties.Settings.Default.TrimTitle;
			}
		}

		/// <summary>
		/// Creates the directory that contains <paramref name="filePath"/> when the path
		/// has a directory part. A bare file name has none, and passing an empty string
		/// to <c>Directory.CreateDirectory</c> would throw.
		/// </summary>
		private static void EnsureParentDirectory(string filePath)
		{
			string directory = Path.GetDirectoryName(filePath);
			if (!string.IsNullOrEmpty(directory))
			{
				Directory.CreateDirectory(directory);
			}
		}

		/// <summary>Lets the user pick the folder for the error log and saves the resulting file path.</summary>
		private void btnLogPathBrowse_Click(object sender, EventArgs e)
		{
			DialogResult result = folderBrowserDialog1.ShowDialog();
			if (result == DialogResult.OK)
			{
				// Path.Combine inserts the directory separator that plain concatenation omitted.
				txtLogPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, "ErrorLog.txt");
				Properties.Settings.Default.LogPath = txtLogPath.Text;
				Properties.Settings.Default.Save();
			}
		}

		/// <summary>Lets the user pick the folder for the XML output and saves the resulting file path.</summary>
		private void btnXMLPathBrowse_Click(object sender, EventArgs e)
		{
			DialogResult result = folderBrowserDialog1.ShowDialog();
			if (result == DialogResult.OK)
			{
				// Path.Combine inserts the directory separator that plain concatenation omitted.
				txtXMLPath.Text = Path.Combine(folderBrowserDialog1.SelectedPath, "WebCrawlerLog.xml");
				Properties.Settings.Default.XMLPath = txtXMLPath.Text;
				Properties.Settings.Default.Save();
			}
		}

		/// <summary>
		/// Downloads the page at the entered URL, extracts its links and shows them in
		/// the results list (URL, Type, Extension, Title).
		/// </summary>
		private void btnRunCrawler_Click(object sender, EventArgs e)
		{
			lvResults.Clear();

			// Persist the inputs so they are restored on the next start.
			Properties.Settings.Default.URLPath = txtURL.Text;
			Properties.Settings.Default.TrimTitle = txtTrimTitle.Text;
			Properties.Settings.Default.Save();

			sSourceURL = Properties.Settings.Default.URLPath;
			sTrimTitle = txtTrimTitle.Text;

			string fPath = txtXMLPath.Text;

			string address = txtURL.Text.Trim();
			if (address.Length == 0)
			{
				MessageBox.Show("Please enter a URL to crawl.", "Web Crawler");
				return;
			}

			// Only prepend a scheme when the text does not already start with one;
			// a scheme appearing later (e.g. inside a query string) must not count.
			string sURL;
			if (address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
				|| address.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
			{
				sURL = address;
			}
			else
			{
				sURL = "http://" + address;
			}

			string htmlString;
			try
			{
				using (WebClient wc = new WebClient())
				{
					htmlString = wc.DownloadString(sURL);
				}
			}
			catch (WebException ex)
			{
				// Report the failure instead of letting it escape the UI event handler.
				MessageBox.Show("Unable to download " + sURL + ": " + ex.Message, "Web Crawler");
				return;
			}

			// NOTE(review): this appends a log entry to the XML output file, which
			// LinkFinder.Find then overwrites when it saves the XML - confirm whether
			// the log path was intended here.
			if (!string.IsNullOrEmpty(fPath))
			{
				using (StreamWriter w = File.AppendText(fPath))
				{
					DirLog.Log(sURL, w);
				}
			}

			lvResults.View = View.Details;
			// Add columns
			lvResults.Columns.Add("URL", -2, HorizontalAlignment.Left);
			lvResults.Columns.Add("Type", -2, HorizontalAlignment.Left);
			lvResults.Columns.Add("Extension", -2, HorizontalAlignment.Left);
			lvResults.Columns.Add("Title", -2, HorizontalAlignment.Left);

			// Find also writes the XML output, so call it exactly once, and read the
			// fields directly rather than re-parsing LinkItem.ToString().
			foreach (LinkItem link in LinkFinder.Find(htmlString))
			{
				ListViewItem lvi = new ListViewItem();
				lvi.Text = (link.Href ?? "").Trim();
				lvi.SubItems.Add((link.Type ?? "").Trim());
				lvi.SubItems.Add((link.Extension ?? "").Trim());
				lvi.SubItems.Add((link.Text ?? "").Trim());
				lvResults.Items.Add(lvi);
			}

			lvResults.AutoResizeColumn(0, ColumnHeaderAutoResizeStyle.HeaderSize);
			lvResults.AutoResizeColumn(1, ColumnHeaderAutoResizeStyle.ColumnContent);
			lvResults.AutoResizeColumn(2, ColumnHeaderAutoResizeStyle.ColumnContent);
			lvResults.AutoResizeColumn(3, ColumnHeaderAutoResizeStyle.ColumnContent);

			lblRowCount.Text = "Row Count: " + lvResults.Items.Count.ToString();
		}

	}

	/// <summary>
	/// One hyperlink extracted from a page.
	/// </summary>
	public struct LinkItem
	{
		/// <summary>Value of the anchor's href attribute.</summary>
		public string Href;
		/// <summary>Anchor content with inner tags and line breaks removed.</summary>
		public string Text;
		/// <summary>"Image" when the anchor wraps an img tag, otherwise "Text".</summary>
		public string Type;
		/// <summary>Extension taken from the href, or "/" when it has none.</summary>
		public string Extension;

		/// <summary>
		/// Returns the fields in the order Href, Text, Type, Extension, separated by
		/// newline + carriage return + tab, so the result can be split on the tab
		/// character and each part trimmed.
		/// </summary>
		public override string ToString()
		{
			// The separator must be real control characters; the literal text "nrt"
			// cannot be split reliably because those letters occur in URLs and titles.
			const string separator = "\n\r\t";
			return Href + separator + Text + separator + Type + separator + Extension;
		}
	}

	/// <summary>
	/// Extracts the anchors of an HTML document with regular expressions and writes
	/// them to the XML file configured in the XMLPath setting.
	/// </summary>
	static class LinkFinder
	{
		// Tabs, newlines, carriage returns, and any tag together with the whitespace around it.
		private const string TagAndLineBreakPattern = @"\t|\n|\r|\s*<.*?>\s*";

		/// <summary>
		/// Finds every <c>&lt;a&gt;...&lt;/a&gt;</c> element in <paramref name="file"/>, builds a
		/// <see cref="LinkItem"/> for each one and saves the list as XML to the XMLPath
		/// setting. A failure while building or saving the XML is written to the error log.
		/// </summary>
		/// <param name="file">HTML text of the page; null is treated as empty.</param>
		/// <returns>The links found, in document order. Anchors without a double-quoted
		/// href, or whose processing throws, are not included.</returns>
		public static List<LinkItem> Find(string file)
		{
			string fErrorPath = Properties.Settings.Default.LogPath;
			file = file ?? "";

			List<LinkItem> linkList = new List<LinkItem>();

			// Page title, shortened; only used to give context in the error log.
			Match mTitle = Regex.Match(file, @"<title>(.*?)</title>", RegexOptions.Singleline);
			string pageTitle = XMLCleanString(mTitle.Value, 25);

			MatchCollection anchors = Regex.Matches(file, @"(<a.*?>.*?</a>)", RegexOptions.Singleline);

			foreach (Match anchor in anchors)
			{
				try
				{
					string value = anchor.Groups[1].Value;

					Match hrefMatch = Regex.Match(value, @"href=""(.*?)""", RegexOptions.Singleline);
					if (!hrefMatch.Success)
					{
						// Without an href there is nothing to report; the extension
						// lookup below would fail on a null path anyway.
						continue;
					}

					LinkItem i = new LinkItem();
					i.Href = hrefMatch.Groups[1].Value;

					bool wrapsImage = Regex.Match(value, "<img.+?src=[\"'](.+?)[\"'].+?>", RegexOptions.IgnoreCase).Success;
					i.Type = wrapsImage ? "Image" : "Text";

					// Remove inner tags and line breaks from the anchor text.
					i.Text = Regex.Replace(value, TagAndLineBreakPattern, "", RegexOptions.Singleline).Trim();

					#region Get Extension From URL
					string extension = VirtualPathUtility.GetExtension(i.Href);
					if (string.IsNullOrEmpty(extension))
					{
						i.Extension = "/";
					}
					else
					{
						// Drop a query string that was returned as part of the extension.
						int queryStart = extension.IndexOf('?');
						i.Extension = queryStart >= 0 ? extension.Substring(0, queryStart) : extension;
					}
					#endregion Get Extension From URL

					linkList.Add(i);
				}
				catch (Exception ex1)
				{
					// Best effort: one anchor that cannot be processed must not abort
					// the crawl. It is skipped, but traced rather than silently dropped.
					Debug.WriteLine("LinkFinder: skipped anchor: " + ex1.Message);
				}
			}

			try
			{
				XElement url = new XElement(
					XMLCleanString(Form1.sSourceURL, 250),
					from ll in linkList
					select new XElement("LinkInfo",
						new XElement("URL", ll.Href),
						new XElement("Type", ll.Type),
						new XElement("Extension", ll.Extension),
						new XElement("Text", ll.Text)));
				url.Save(Properties.Settings.Default.XMLPath);
			}
			catch (Exception ex)
			{
				string eMsg = ex.ToString() + Environment.NewLine + "pageTitle: " + pageTitle;

				if (string.IsNullOrEmpty(fErrorPath))
				{
					// No log file configured: File.AppendText would throw on an empty path.
					Debug.WriteLine(eMsg);
				}
				else
				{
					using (StreamWriter sw = File.AppendText(fErrorPath))
					{
						DirLog.Log(eMsg, sw);
					}
				}
			}

			return linkList;
		}

		/// <summary>
		/// Reduces a title or URL to a short token: strips tags and line breaks, removes
		/// the http/https scheme, turns "/" and whitespace runs into "_", removes the
		/// text in <c>Form1.sTrimTitle</c> and the entity <c>&amp;#39;</c>, then truncates.
		/// </summary>
		/// <param name="source">Text to clean; null is treated as empty.</param>
		/// <param name="nLength">Length limit. A longer string is cut to
		/// <c>nLength - 1</c> characters (behaviour kept from the original implementation).</param>
		/// <returns>The cleaned, possibly truncated string.</returns>
		public static string XMLCleanString(string source, int nLength)
		{
			if (source == null)
			{
				return "";
			}

			source = source.Trim();
			source = Regex.Replace(source, TagAndLineBreakPattern, "", RegexOptions.Singleline).Trim();
			source = source.Replace("http://", "");
			source = source.Replace("https://", "");
			source = source.Replace("/", "_");

			// string.Replace throws ArgumentException for an empty search string, and
			// the trim text is empty unless the user entered one.
			if (!string.IsNullOrEmpty(Form1.sTrimTitle))
			{
				source = source.Replace(Form1.sTrimTitle, "");
			}

			source = Regex.Replace(source, @"\s+", "_");
			source = source.Replace(@"&#39;", "");

			if (source.Length > nLength)
			{
				// Same result as Substring(0, nLength) followed by dropping the last
				// character, without throwing when nLength is zero or negative.
				source = source.Substring(0, Math.Max(nLength - 1, 0));
			}
			return source;
		}
	}

	/// <summary>
	/// Minimal text logger: writes formatted entries to a writer and can echo a log
	/// to the console.
	/// </summary>
	class DirLog
	{
		/// <summary>
		/// Writes one log entry to <paramref name="w"/>: a blank line, a header with the
		/// current time and date, the message, and a separator line.
		/// </summary>
		/// <param name="logMessage">Text of the entry.</param>
		/// <param name="w">Destination writer; it is neither flushed nor closed here.</param>
		public static void Log(string logMessage, TextWriter w)
		{
			// Read the clock once so the time and the date belong to the same instant.
			DateTime now = DateTime.Now;

			w.Write("\r\nLog Entry : ");
			w.WriteLine("{0} {1}", now.ToLongTimeString(), now.ToLongDateString());
			w.WriteLine("  :");
			w.WriteLine("  :{0}", logMessage);
			w.WriteLine("--------------------------------------------------------------------------------");
		}

		/// <summary>Copies every remaining line of <paramref name="r"/> to the console.</summary>
		/// <param name="r">Reader positioned where the dump should start.</param>
		public static void DumpLog(StreamReader r)
		{
			string line;
			while ((line = r.ReadLine()) != null)
			{
				Console.WriteLine(line);
			}
		}
	}
}
Originally Posted on August 12, 2013
Last Updated on October 26, 2015
All information on this site is shared with the intention to help. Before any source code or program is run on a production (non-development) system, it is suggested you test it and fully understand what it is doing, not just what it appears it is doing. I accept no responsibility for any damage you may do with this code.

Related

C# Project Code

Post navigation

Previous post
Next post

Related Posts

Mac OS X – Get Network Information

Posted on February 27, 2017May 2, 2017

Every once in a while you need to get some basic network information from your Mac or a Mac you are supporting, this script below will help get you some helpful information. #!/bin/sh clear sExternalMACALService="http://dns.kittell.net/macaltext.php?address=" # List all Network ports NetworkPorts=$(ifconfig -uv | grep ‘^[a-z0-9]’ | awk -F : ‘{print…

Read More

PowerShell – UNIX SED Equivalent – Change Text In File

Posted on March 3, 2016March 3, 2016

Unix SED command is very useful to make changes to a configuration file when you know what the default values are, below is a way to do a SED-like text change in PowerShell (Get-Content c:\temp\test.txt).replace(‘[MYID]’, ‘MyValue’) | Set-Content c:\temp\test.txt Example: TeamCity Build Agent Configuration file needs updated so it knows…

Read More

UNIX – Set SSH Banner

Posted on January 17, 2018May 4, 2025

# Switch to root user sudo -s echo -e "\033[01m\e[4mType your desired hostname for the server, followed by [ENTER]:\e[0m\033[0m" read hostname sudo hostnamectl set-hostname –static "$hostname" sudo hostnamectl set-hostname "$hostname" hostnamectl status companyname="Kittell.net" declare OSVer=$(cat /etc/redhat-release) declare sCPU=$(grep -c ^processor /proc/cpuinfo ) # echo "CPU: $sCPU" declare sRamGB=$(cat /proc/meminfo |…

Read More

Code

Top Posts & Pages

  • PowerShell - Rename Pictures to Image Taken
  • Front Page
  • C# - Start/Stop/Restart Services
  • MacPorts / HomeBrew - Rip CD tracks from terminal
  • PowerShell - Show File Extensions

Recent Posts

  • Javascript – Digital Clock with Style
  • BASH – Web Ping Log
  • BASH – Picture / Video File Name Manipulation
  • Mac OSX Terminal – Create SSH Key
  • Bash – Rename Picture

Top Posts

  • PowerShell - Rename Pictures to Image Taken
  • C# - Start/Stop/Restart Services
  • MacPorts / HomeBrew - Rip CD tracks from terminal
  • PowerShell - Show File Extensions
  • Open On Screen Keyboard (OSK)
  • SQLite - Auto-Increment / Auto Generate GUID
©2025 David Kittell | WordPress Theme by SuperbThemes
 

Loading Comments...
 

You must be logged in to post a comment.