Ultimate PHP Scraper V1 (parse websites)

Status
Not open for further replies.

litewarez

Active Member
1,367
2008
1
0
Heya all,


Here's a scraper that I created, which you can use to split a website's HTML source into different arrays.

Currently supports:


  • doctype
  • doc title
  • keywords
  • link rel
  • external css
  • h1
  • h2
  • h3
  • h4
  • h5
  • h6
  • p
  • a content
  • a href
  • a href count
  • a additionaltags
  • span
  • script
  • ul
  • li
  • comments
  • ids
  • classes
  • meta content
  • styles
  • tag titles
  • image alt
  • images
  • mailto
  • emails
  • count keywords
Here's the class file:

Save this as parse.php
PHP:
<?php
/**
 * Downloads one page with cURL and exposes regex-based extractors over its HTML.
 *
 * Most extractors return a two-element array: index 0 holds the matched
 * string(s), index 1 holds how many were found. The link helpers
 * (get_a_content, get_a_href, get_a_additionaltags) return the bare list,
 * and get_a_href_count / countkeyword return an integer.
 *
 * The extraction is done with regular expressions, not an HTML parser, so
 * nested or unusually formatted markup can be missed or over-matched.
 */
class ParseSite{

    /** Raw body of the fetched page; every extractor reads from this. */
    public $DataFromSite = '';

    /** URL handed to the constructor. */
    public $url = '';

    /** cURL options used for the last fetch. */
    public $CurlOP = array();

    /** cURL handle of the last fetch (already closed once grab_page returns). */
    public $ch = null;

    /** Body returned by the last fetch, '' when the fetch failed. */
    public $Data = '';

    /**
     * Fetches $url immediately and stores the body in $DataFromSite.
     *
     * @param string $url Address of the page to scrape.
     */
    public function __construct($url){
        $this->url = $url;
        $this->DataFromSite = $this->grab_page();
    }

    /**
     * Downloads $this->url with cURL.
     *
     * @return string The response body, or '' when the transfer failed.
     */
    private function grab_page(){
        $this->CurlOP = array(
            CURLOPT_RETURNTRANSFER    => true,    // return web page
            CURLOPT_HEADER        => false,    // don't return headers
            CURLOPT_FOLLOWLOCATION    => true,    // follow redirects
            CURLOPT_ENCODING    => "",        // handle all encodings
            CURLOPT_USERAGENT    => "LWS V1.0",    // who am i
            CURLOPT_AUTOREFERER    => true,    // set referer on redirect
            CURLOPT_CONNECTTIMEOUT    => 120,        // timeout on connect
            CURLOPT_TIMEOUT        => 120,        // timeout on response
            CURLOPT_MAXREDIRS    => 10,        // stop after 10 redirects
            // NOTE(review): certificate checks are switched off, so HTTPS
            // responses are not authenticated. Kept as-is so sites with
            // self-signed certificates keep working; enable for trusted use.
            CURLOPT_SSL_VERIFYHOST    => 0,        // don't verify ssl
            CURLOPT_SSL_VERIFYPEER    => false,    //
        );
        $this->Data = '';
        $this->ch = curl_init($this->url);
        if ($this->ch === false) {
            // No handle means nothing to configure or execute.
            return $this->Data;
        }
        curl_setopt_array($this->ch,$this->CurlOP);
        $body = curl_exec($this->ch);
        curl_close($this->ch);
        // curl_exec() yields false on failure; normalise to '' so the
        // extractors always receive a string subject.
        $this->Data = is_string($body) ? $body : '';
        return $this->Data;
    }

    /**
     * Runs preg_match_all over the page and returns one capture group.
     *
     * @param string $pattern PCRE pattern including delimiters and flags.
     * @param int    $group   Capture group to return (0 = whole match).
     * @return array List of matched strings; empty on no match or regex error.
     */
    private function find_all($pattern, $group){
        $found = preg_match_all($pattern, (string) $this->DataFromSite, $patterns);
        if ($found === false || !isset($patterns[$group])) {
            return array();
        }
        return $patterns[$group];
    }

    /**
     * Same as find_all(), packaged as array(matches, number of matches).
     */
    private function find_all_counted($pattern, $group){
        $items = $this->find_all($pattern, $group);
        return array($items, count($items));
    }

    /**
     * Runs preg_match over the page and returns array(match, 1),
     * or array(null, 0) when the pattern does not match.
     */
    private function find_first_counted($pattern, $group){
        $hit = preg_match($pattern, (string) $this->DataFromSite, $patterns) === 1
            && isset($patterns[$group]);
        return array($hit ? $patterns[$group] : null, $hit ? 1 : 0);
    }

    /**
     * Text content of every <$tag>...</$tag> element whose content starts
     * with a word character. Shared by the heading and paragraph getters.
     */
    private function tag_contents($tag){
        return $this->find_all_counted('/(<' . $tag . '.*>)(\w.*)(<\/' . $tag . '>)/ismU', 2);
    }

    /**
     * Retrieve the document type declaration.
     *
     * @return array array(full "<!DOCTYPE ...>" string or null, 1 or 0)
     */
    public function get_doctype(){
        // Matches any doctype, including the short HTML5 form.
        return $this->find_first_counted('/<!DOCTYPE\b[^>]*>/i', 0);
    }

    /**
     * Retrieve the page title.
     *
     * @return array array("<title>...</title>" including the tags, or null, 1 or 0)
     */
    public function get_doc_title(){
        // Lazy match so only the first title element is returned.
        return $this->find_first_counted('/<title\b[^>]*>.*?<\/title>/is', 0);
    }

    /**
     * Retrieve the content of the keywords meta tag.
     *
     * Expects the name attribute directly before the content attribute.
     *
     * @return array array(keyword string or null, 1 or 0)
     */
    public function get_keywords(){
        return $this->find_first_counted('/<meta\s+name="keywords"\s+content="([^"]*)"\s*\/?>/i', 1);
    }

    /**
     * Get rel links in the header of the site.
     *
     * @return array array(full preg_match_all result, number of matches)
     */
    public function get_link_rel(){
        $found = preg_match_all('/(rel=)(".*") href=(".*")/im', (string) $this->DataFromSite, $patterns);
        if ($found === false || !isset($patterns[2])) {
            // Keep the usual four-slot shape even when the regex failed.
            $patterns = array(array(), array(), array(), array());
        }
        return array($patterns, count($patterns[2]));
    }

    /** Retrieve hrefs that end in .css. */
    public function get_external_css(){
        return $this->find_all_counted('/(href=")(\w.*\.css)"/i', 2);
    }

    /** Retrieve all h1 tags. */
    public function get_h1(){
        return $this->tag_contents('h1');
    }

    /** Retrieve all h2 tags. */
    public function get_h2(){
        return $this->tag_contents('h2');
    }

    /** Retrieve all h3 tags. */
    public function get_h3(){
        return $this->tag_contents('h3');
    }

    /** Retrieve all h4 tags. */
    public function get_h4(){
        return $this->tag_contents('h4');
    }

    /** Retrieve all h5 tags. */
    public function get_h5(){
        return $this->tag_contents('h5');
    }

    /** Retrieve all h6 tags. */
    public function get_h6(){
        return $this->tag_contents('h6');
    }

    /** Retrieve p tag contents. */
    public function get_p(){
        return $this->tag_contents('p');
    }

    /** Retrieve names of links (bare list, no count). */
    public function get_a_content(){
        return $this->find_all("/(<a.*>)(\w.*)(<.*>)/ismU", 2);
    }

    /** Retrieve link destinations (bare list, no count). */
    public function get_a_href(){
        return $this->find_all('/(href=")(.*?)(")/i', 2);
    }

    /** Get the number of anchors carrying an href. */
    public function get_a_href_count(){
        return count($this->find_all('/<(a.*) href=\"(.*?)\"(.*)<\/a>/', 0));
    }

    /** Get all additional attributes that follow href inside a link tag. */
    public function get_a_additionaltags(){
        return $this->find_all('/<(a.*) href="(.*?)"(.*)>(.*)(<\/a>)/', 3);
    }

    /** Retrieve span contents (only spans that carry attributes). */
    public function get_span(){
        return $this->find_all_counted('/(<span .*>)(.*)(<\/span>)/', 2);
    }

    /** Retrieve the bodies of script elements. */
    public function get_script(){
        return $this->find_all_counted('/(<script.*>)(.*)(<\/script>)/imxsU', 2);
    }

    /**
     * Retrieve content of ul's.
     *
     * The x flag makes the space in the pattern insignificant, so only
     * opening tags of the form "<ul>" (no attributes) are matched.
     */
    public function get_ul(){
        return $this->find_all_counted('/(<ul \w*>)(.*)(<\/ul>)/ismxU', 2);
    }

    /**
     * Retrieve li contents.
     *
     * The x flag makes the space in the pattern insignificant, so only
     * opening tags of the form "<li>" (no attributes) are matched.
     */
    public function get_li(){
        return $this->find_all_counted('/(<li \w*>)(.*)(<\/li>)/ismxU', 2);
    }

    /** Retrieve page comments (text after the first character following "<!--"). */
    public function get_comments(){
        return $this->find_all_counted('/(<!--).(.*)(-->)/isU', 2);
    }

    /** Retrieve all used id's on the page. */
    public function get_ids(){
        return $this->find_all_counted('/(id="(\w*)")/is', 2);
    }

    /** Retrieve all used classes ( inline ) of the document. */
    public function get_classes(){
        return $this->find_all_counted('/(class="(\w*)")/is', 2);
    }

    /** Get the attribute text of self-closed meta tags. */
    public function get_meta_content(){
        return $this->find_all_counted('/(<meta)(.*="(.*)").\/>/ix', 2);
    }

    /** Get inline styles. */
    public function get_styles(){
        return $this->find_all_counted('/(style=")(.*?)(")/is', 2);
    }

    /** Get titles of tags. */
    public function get_tag_titles(){
        return $this->find_all_counted('/(title=)"(.*)"(.*)/', 2);
    }

    /** Get image alt descriptions (letters, digits and whitespace only). */
    public function get_image_alt(){
        return $this->find_all_counted('/(alt=.)([a-zA-Z0-9\s]{1,})/', 2);
    }

    /** Retrieve image sources where src directly follows "<img". */
    public function get_images(){
        return $this->find_all_counted('/(<img)\s (src="([a-zA-Z0-9\.;:\/\?&=_|\r|\n]{1,})")/isxmU', 3);
    }

    /** Retrieve the email address of mailto links, if any. */
    public function get_mailto(){
        return $this->find_all_counted('/(<a\shref=")(mailto:)([a-zA-Z@0-9\.]{1,})"/ims', 3);
    }

    /** Retrieve any email address found in the page. */
    public function get_emails(){
        return $this->find_all_counted('/[a-zA-Z0-9_-]{1,}@[a-zA-Z0-9-_]{1,}\.[a-zA-Z]{1,4}/', 0);
    }

    /**
     * Count how often a keyword occurs in the page (case-sensitive).
     *
     * @param string $word Literal text to look for; not treated as a regex.
     * @return int Number of non-overlapping occurrences, 0 for an empty word.
     */
    public function countkeyword($word){
        $word = (string) $word;
        if ($word === '') {
            // substr_count() rejects an empty needle.
            return 0;
        }
        return substr_count((string) $this->DataFromSite, $word);
    }

    /**
     * Retrieve just the name without www and com/eu/de etc.
     *
     * Works on the host part of $this->url only: the last label is dropped,
     * and when more than one label remains the first one (e.g. "www") is
     * dropped as well. What is left is joined without dots.
     *
     * @return string The bare name, or '' when the URL has no usable host.
     */
    public function get_domain_name_only(){
        $host = parse_url((string) $this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return '';
        }
        $labels = explode('.', $host);
        array_pop($labels);                 // drop com/eu/de ...
        if (count($labels) > 1) {
            array_shift($labels);           // drop www / leading subdomain
        }
        return implode('', $labels);
    }
}

?>
EXAMPLE CODE:

PHP:
<?php 

// Load the ParseSite class and scrape the demo site once.
require_once 'parse.php';
$Parse = new ParseSite("http://www.phazeddl.com");

// Capture the dump instead of printing it directly: the values are raw HTML
// taken from a remote page, and echoing them unescaped would let the browser
// render (or execute) that markup instead of showing it.
ob_start();
var_dump(
    $Parse->get_doctype(),
    $Parse->get_doc_title(),
    $Parse->get_keywords(),
    $Parse->get_link_rel(),
    $Parse->get_external_css(),
    $Parse->get_h1(),
    $Parse->get_h2(),
    $Parse->get_h3(),
    $Parse->get_h4(),
    $Parse->get_h5(),
    $Parse->get_h6(),
    $Parse->get_p(),
    $Parse->get_a_content(),
    $Parse->get_a_href(),
    $Parse->get_a_href_count(),
    $Parse->get_a_additionaltags(),
    $Parse->get_span(),
    $Parse->get_script(),
    $Parse->get_ul(),
    $Parse->get_li(),
    $Parse->get_comments(),
    $Parse->get_ids(),
    $Parse->get_classes(),
    $Parse->get_meta_content(),
    $Parse->get_styles(),
    $Parse->get_tag_titles(),
    $Parse->get_image_alt(),
    $Parse->get_images(),
    $Parse->get_mailto(),
    $Parse->get_emails(),
    $Parse->countkeyword("warez"),
    $Parse->get_domain_name_only()
);
$dump = ob_get_clean();

echo "<pre>";
// ENT_SUBSTITUTE keeps the output visible even if the page is not valid UTF-8.
echo htmlspecialchars($dump, ENT_QUOTES | ENT_SUBSTITUTE);
echo "</pre>";
?>
Hope you enjoy it, and please comment and report any bugs :P

thanks
 
1 comment
Status
Not open for further replies.
Back
Top