Code:
open System
open System.Net
open Hyperz.SharpLeech.Engine
open Hyperz.SharpLeech.Engine.Html
open Hyperz.SharpLeech.Engine.Net
(* Downloads `url` through the Http helper and returns the Data of the result.
   When the request reports an error its Data is returned unchanged; otherwise
   the Data comes from the result of Http.HandleRedirects(result, false). *)
let getData url =
    let response = url |> Http.Prepare |> Http.Request
    if response.HasError then
        response.Data
    else
        let followed = Http.HandleRedirects(response, false)
        followed.Data
(* Parses `html`, collects the href of every <a> element and returns the
   links as absolute http/https URLs resolved against `sourceUrl`.

   html      - markup of the page that was downloaded
   sourceUrl - absolute address of that page; used as the base for relative links
   returns   - a lazy sequence of non-empty absolute URL strings

   Raises UriFormatException when `sourceUrl` is not a valid absolute URI. *)
let getUrls html sourceUrl =
    (* The full page URI (not just scheme + host) is the base, so relative
       links resolve against the page's own path and a non-default port is kept. *)
    let pageUri = new Uri(sourceUrl)
    let doc = new HtmlDocument()
    doc.LoadHtml(html)
    (* Maps one decoded href to an absolute URL, or "" when it must be skipped. *)
    let resolve (href: string) =
        (* Anchors without an href and pure fragment links point at nothing new. *)
        if href.Length = 0 || href.StartsWith("#") then ""
        else
            match Uri.TryCreate(pageUri, href) with
            | true, target ->
                (* Drops mailto:, javascript:, ftp: and similar non-web links. *)
                let isWeb =
                    target.Scheme = Uri.UriSchemeHttp || target.Scheme = Uri.UriSchemeHttps
                if isWeb then target.AbsoluteUri else ""
            | false, _ -> ""
    (* NOTE(review): HtmlAgilityPack-style SelectNodes returns null rather than an
       empty collection when nothing matches; guard in case this HtmlDocument
       behaves the same way. *)
    match doc.DocumentNode.SelectNodes("//a") with
    | null -> Seq.empty
    | anchors ->
        anchors
        |> Seq.map (fun node -> HttpUtility.HtmlDecode(node.GetAttributeValue("href", "")).Trim())
        |> Seq.map resolve
        |> Seq.filter (fun link -> link.Length > 0)
(* Fetches `url`, prints how many links on it are not yet in `crawled`, and
   starts a new crawl for each of those links. The work is started with
   Async.Start, so this function returns immediately.

   url     - address of the page to crawl
   crawled - list of URLs that this branch must not crawl again *)
let rec crawl url crawled =
    async {
        try
            let visited = Set.ofList crawled
            (* Materialise once: the links are counted and then iterated, and a
               lazy seq would re-run the whole extraction for each pass. *)
            let fresh =
                getUrls (getData url) url
                |> Seq.filter (fun u -> not (visited.Contains u))
                |> Seq.distinct
                |> List.ofSeq
            printfn "Crawling: %s\nFound: %i URL's" url (List.length fresh)
            (* Every child is told about all of its siblings, not only itself,
               so a branch does not re-crawl a link that a sibling already owns. *)
            let seen = crawled @ fresh
            for u in fresh do crawl u seen
        with ex ->
            (* An exception escaping a fire-and-forget async would otherwise go
               unhandled; report the page and let the other branches continue. *)
            eprintfn "Skipping %s: %s" url ex.Message
    }
    |> Async.Start
(* ================================================ *)
(* START CRAWLING *)
(* ================================================ *)
(* Seed URL: the page the crawl starts from; it is also the initial entry of the crawled list. *)
let url = "http://thepiratebay.org/"
(* Endless loop that forces a garbage collection every 10 seconds.
   Never returns; the caller runs it in the background. *)
let rec memCleaner() =
    let intervalMs = 10000
    System.Threading.Thread.Sleep(intervalMs)
    GC.Collect()
    memCleaner()
(* Network settings *)
ServicePointManager.DefaultConnectionLimit <- 10
Http.MaxRedirects <- 2
Http.Timeout <- 10000
Http.KeepAlive <- true
Http.UseCompression <- true

(* Console window settings *)
Console.BufferWidth <- 256
Console.BufferHeight <- 768
Console.Title <- "F# Web Crawler"

(* Launch the background memory cleaner, then the crawl from the seed URL *)
async { memCleaner() } |> Async.Start
crawl url [url]

(* Block on console input so the process stays alive while the crawl runs *)
ignore (stdin.Read())
This is just a little something I made while I was working on SL to test the performance of the Http class. It crawls a web page, extracts the links, filters out the dupes (already-crawled ones) and adds the new links to a queue. It's based on recursion and is "stateless" (no mutable variables are used to store data or state). That means no matter how many CPU cores you have, it'll scale linearly across all of them with very little performance loss. It doesn't have any practical function, but it might be a handy reference for people who are looking into functional programming or F#.
Video showing a slightly modified version:
[youtube]c41mmbuXBZQ[/youtube]
Ignore the heavy hip-hop tune and other nonsense. I was a bit drunk when I was recording it, lol.