73.gif

Search (advanced search)
Use this search form before posting, asking a question, or making a new thread.
Tip: Use quotation marks to search for an exact phrase (e.g. "How To Make Money Online")

03-15-2012, 11:54 PM
Post: #1
[GET] *Related Content, Phone # , Email Scraper*
*Related Content, Phone # , Email Scraper*

Code:
http://i.imgur.com/lyGFI.png

Description:

Web-based script written in PHP allowing super simple, but blazing fast scraping of what's important! It offers an easy way to scrape the top 3 search engines (Google, Yahoo, Bing) and their results pages (up to 10 full pages each). It then automatically parses and displays the emails and phone numbers related to the keyword you specify. It's extremely quick, and can scrape and parse 30 pages of SERPs in seconds (make sure to use proxies to avoid bans from the sources). Enjoy!


Features:


- Collect email addresses, phone numbers, and related content for any keyword

- Scrapes google,yahoo,bing simultaneously (Up to 30 Threads at a time!)

- Randomly chooses from a built-in list of common User-Agents for extra anonymity.

- Choose how many pages of SERP's to scrape per source

- Anonymous HTTP Proxy Support

- Randomly chooses new proxies for each thread (yes up to 30)

- "Multi-threaded", blazing fast parallel Scraping using PHP's curl_multi

- Super simple and easy to install even on your windows PC! (wamp stack)

- Under 200 lines of code!


Example Uses:

- Build targeted contact lists (and even sell them if you want)

- Use for SMS or Email marketing (so many ways available)

- Set up macros to auto generate content for your BH needs

- Basic filtering of emails/phone numbers from SERP snippets

- And many more creative ways!


Requirements:


- PHP5 with DOM extension enabled

Installation:

1. Copy php code onto your server (paste the code into a file and name it with .php extension).
2. Access the script in any browser
3. Enjoy!

Screenshots:

Code:
http://i.imgur.com/iKyys.jpg

PHP Code:
<?php

// Silence PHP diagnostics for this script: nothing is rendered into the page
// (display_errors off) and no error level is reported (error_reporting 0).
ini_set('display_errors', '0');
error_reporting(0);
/**
 * Scrapes Google, Yahoo and Bing result pages for a keyword in parallel and
 * extracts phone numbers, e-mail addresses and raw snippet text from them.
 *
 * All work happens in the constructor; when it returns, the public $results
 * property holds an HTML fragment ready to be echoed into a page.
 */
class BHW_Contact_Scraper
{
    /** @var string HTML fragment accumulated by _process(). */
    public $results = '<h2>Harvested Results</h2>';

    /** @var string Search phrase that is prepended to the keyword. */
    public $query = '"contact us at"';

    /** @var string Keyword exactly as supplied by the caller. */
    private $_rawKeyword;

    /** @var int Number of result pages requested per search engine. */
    private $_maxPages;

    /** @var mixed curl_multi handle created by _setupHandles(). */
    private $_multiHandle;

    /** @var array Individual curl handles, one per URL. */
    private $_handles = array();

    /** @var array Proxy entries in "host:port" form. */
    private $_proxies = array();

    /** @var array User-Agent strings; one is picked at random per request. */
    private $_userAgents = array(
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB0.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; GACID=)"
    );

    /**
     * Builds the search URLs, fetches them in parallel and fills $results.
     *
     * @param string $keyword Keyword appended to the fixed search phrase.
     * @param int    $depth   Result pages to fetch from each search engine.
     * @param string $proxies Newline-separated "host:port" list; '' disables proxying.
     */
    public function __construct($keyword, $depth = 1, $proxies = '')
    {
        if ($proxies != '') {
            // Normalise line endings, then drop blank lines so that a random
            // pick can never return an empty proxy entry.
            $lines = explode("\n", str_replace("\r", '', $proxies));
            $this->_proxies = array_values(array_filter(array_map('trim', $lines), 'strlen'));
        }

        $this->_maxPages   = (int) $depth;
        $this->_rawKeyword = $keyword;

        $query = urlencode($this->query . $keyword);

        // Always an array, even when no pages were requested.
        $urls = array();
        for ($x = 0; $x < $this->_maxPages; $x++) {
            // Google pages by zero-based result offset.
            $urls[] = 'http://www.google.com/search?q=' . $query . ($x > 0 ? '&start=' . ($x * 10) : '');
            // Yahoo and Bing page by one-based result offset.
            $urls[] = 'http://search.yahoo.com/search?p=' . $query . ($x > 0 ? '&b=' . (($x * 10) + 1) : '');
            $urls[] = 'http://bing.com/search?q=' . $query . ($x > 0 ? '&first=' . (($x * 10) + 1) : '');
        }

        $this->_setupHandles($urls);
        $this->_crawlAll();
        $this->_process();
        $this->_releaseHandles();
    }

    /**
     * Creates one curl handle per URL and attaches it to a new multi handle.
     *
     * @param array $urls URLs to fetch; a non-array value creates no handles.
     */
    private function _setupHandles($urls)
    {
        $this->_multiHandle = curl_multi_init();

        if (!is_array($urls)) {
            return;
        }

        foreach ($urls as $url) {
            $ch = curl_init();

            $options = array(
                CURLOPT_URL            => $url,
                // Send the User-Agent string itself, not its array index.
                CURLOPT_USERAGENT      => $this->_userAgents[mt_rand(0, count($this->_userAgents) - 1)],
                CURLOPT_FOLLOWLOCATION => 1,
                CURLOPT_RETURNTRANSFER => 1,
                CURLOPT_TIMEOUT        => 10,
                CURLOPT_CONNECTTIMEOUT => 10,
            );

            // Proxy support: a random entry per handle.
            if (count($this->_proxies) > 0) {
                $proxy = $this->_proxies[mt_rand(0, count($this->_proxies) - 1)];
                $p     = explode(':', $proxy);

                $options[CURLOPT_PROXY] = $p[0];
                // The port is optional; only set it when one was supplied.
                if (isset($p[1]) && $p[1] !== '') {
                    $options[CURLOPT_PROXYPORT] = (int) $p[1];
                }
            }

            curl_setopt_array($ch, $options);
            $this->_handles[] = $ch;
            curl_multi_add_handle($this->_multiHandle, $ch);
        }
    }

    /**
     * Drives every attached transfer until none is active or curl reports an error.
     */
    private function _crawlAll()
    {
        $mh     = $this->_multiHandle;
        $active = null;

        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        // Run in parallel.
        while ($active && $mrc == CURLM_OK) {
            // curl_multi_select() can return -1 without waiting. Back off
            // briefly and still call curl_multi_exec(), otherwise $active
            // never changes and the loop spins forever.
            if (curl_multi_select($mh) == -1) {
                usleep(100000);
            }

            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);
        }
    }

    /**
     * Parses every fetched page, extracts numbers and e-mails from the snippet
     * text and appends the report sections to $results.
     *
     * If a transfer failed because its proxy could not be resolved, an error
     * heading is echoed and processing stops without touching $results.
     */
    private function _process()
    {
        $snippets = '';
        $snips    = array('g' => '', 'y' => '', 'b' => '');

        foreach ($this->_handles as $ch) {
            $html = curl_multi_getcontent($ch);

            if ($html == '') {
                $e = curl_error($ch);
                // 5 is CURLE_COULDNT_RESOLVE_PROXY; the message check is kept
                // because the wording differs between libcurl versions.
                if (curl_errno($ch) == 5 || stristr($e, "Couldn't resolve proxy")) {
                    echo "<h1 style=\"color:red;\">ERROR: " . $this->_esc($e) . "</h1>";
                    return;
                }
                // Nothing to parse; DOMDocument::loadHTML('') throws on PHP 8.
                continue;
            }

            $curlInfo = curl_getinfo($ch);
            $engine   = $this->_detectEngine(isset($curlInfo['url']) ? $curlInfo['url'] : '');
            if ($engine === null) {
                continue;
            }
            list($t, $xquery) = $engine;

            // Collect markup warnings internally instead of using "@".
            $previous = libxml_use_internal_errors(true);
            $dom      = new DOMDocument();
            $dom->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            $xpath  = new DOMXPath($dom);
            $result = $xpath->query($xquery);
            if ($result === false) {
                continue;
            }

            for ($i = 0; $i < $result->length; $i++) {
                $text      = ' ' . $result->item($i)->nodeValue . ' ';
                $snippets .= $text;
                $snips[$t] .= $text;
            }
        }

        // Extraction runs on the raw text; escaping happens only on output.
        $numbers = $this->_extractNumbers($snippets);
        $emails  = $this->_extractEmails($snippets);

        // The keyword and the scraped text are untrusted, so both are escaped.
        // Numbers and e-mails contain only characters allowed by their regexes.
        $label = $this->_esc($this->_rawKeyword);

        $this->results .= '<h3>Numbers Related to "' . $label . '"</h3>' . implode('<br/>', $numbers);
        $this->results .= '<h3>Emails Related to "' . $label . '"</h3>' . implode('<br/>', $emails);
        $this->results .= '<h3>Google Content Related to "' . $label . '"</h3>' . $this->_esc($snips['g']);
        $this->results .= '<h3>Yahoo Content Related to "' . $label . '"</h3>' . $this->_esc($snips['y']);
        $this->results .= '<h3>Bing Content Related to "' . $label . '"</h3>' . $this->_esc($snips['b']);
        $this->results .= '<h3>Combined Raw SERP Content Related to "' . $label . '"</h3>' . $this->_esc($snippets);
    }

    /**
     * Maps a fetched URL to its engine tag and snippet XPath query.
     *
     * Only the host is inspected, so a keyword such as "bing" inside the query
     * string cannot misroute a Google page.
     *
     * @param string $url Effective URL of the transfer.
     * @return array|null array(tag, xpath) or null when the host is not recognised.
     */
    private function _detectEngine($url)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host)) {
            return null;
        }

        if (stripos($host, 'google') !== false) {
            return array('g', '//li[@class="g"]/div');
        }
        if (stripos($host, 'yahoo') !== false) {
            return array('y', '//div[@id="web"]/ol/li/div/div[@class="abstr"]');
        }
        if (stripos($host, 'bing') !== false) {
            return array('b', '//div[@id="results_container"]/div[@id="results"]/ul/li/div/p');
        }

        return null;
    }

    /**
     * Finds 3-3-4 digit groups in the text and formats them as "123-456-7890".
     *
     * @param string $text Text to scan.
     * @return array Unique formatted numbers in order of first appearance.
     */
    private function _extractNumbers($text)
    {
        $numbers = array();

        preg_match_all('/[^0-9]([0-9]{3})[^0-9]*?([0-9]{3})[^0-9]*?([0-9]{4})[^0-9]/', $text, $result, PREG_PATTERN_ORDER);
        foreach ($result[0] as $i => $n) {
            $numbers[] = implode('-', array($result[1][$i], $result[2][$i], $result[3][$i]));
        }

        return array_values(array_unique($numbers));
    }

    /**
     * Finds e-mail addresses in the text.
     *
     * @param string $text Text to scan.
     * @return array Unique addresses in order of first appearance.
     */
    private function _extractEmails($text)
    {
        $emails = array();

        preg_match_all('/([a-zA-Z_.0-9]{1,}?)@([a-zA-Z\-0-9]{2,}?)\.(([a-zA-Z\-0-9]{3})|(co\.uk))/', $text, $result, PREG_PATTERN_ORDER);
        foreach ($result[0] as $i => $n) {
            $emails[] = $result[1][$i] . '@' . $result[2][$i] . '.' . $result[3][$i];
        }

        return array_values(array_unique($emails));
    }

    /**
     * Escapes a value for output into HTML text or a quoted attribute.
     *
     * @param mixed $text Value to escape; it is cast to string.
     * @return string Escaped text.
     */
    private function _esc($text)
    {
        $flags = ENT_QUOTES;
        // ENT_SUBSTITUTE is missing on older PHP 5 releases.
        if (defined('ENT_SUBSTITUTE')) {
            $flags |= ENT_SUBSTITUTE;
        }

        return htmlspecialchars((string) $text, $flags, 'UTF-8');
    }

    /**
     * Detaches every curl handle from the multi handle and closes the multi handle.
     */
    private function _releaseHandles()
    {
        foreach ($this->_handles as $ch) {
            curl_multi_remove_handle($this->_multiHandle, $ch);
        }
        curl_multi_close($this->_multiHandle);

        $this->_handles     = array();
        $this->_multiHandle = null;
    }
}

    if(
$_POST['submit']){
        if(isset(
$_POST['keyword']) and& !empty($_POST['keyword'])){
            
$scraper = new BHW_Contact_Scraper($_POST['keyword'],$_POST['pagedepth'],$_POST['proxies']);
            
$results $scraper->results;    
        }else{
?>
            <h1 style="color:red; font-weight: bold;">YOU FORGOT THE KEYWORD</h1>
        <?php }
    }
?>
        
<?php
// View: the search form plus any report produced by the request handling above.
// Every request-derived value is escaped once here and only the escaped copy
// is echoed into the markup.
$formKeyword = (isset($_POST['keyword']) && is_string($_POST['keyword']))
    ? htmlspecialchars($_POST['keyword'], ENT_QUOTES, 'UTF-8')
    : '';
$formProxies = (isset($_POST['proxies']) && is_string($_POST['proxies']))
    ? htmlspecialchars($_POST['proxies'], ENT_QUOTES, 'UTF-8')
    : '';
$formDepth = isset($_POST['pagedepth']) ? (int) $_POST['pagedepth'] : 1;
// Post back to the current request URI. Leaving out scheme and host keeps the
// form working under HTTPS and stops the Host header reaching the markup.
$formAction = htmlspecialchars(isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '', ENT_QUOTES, 'UTF-8');
?>
<html>
    <h2>Related Content, Phone # , Email Scraper by Amrak @ BHW</h2>

    <form action="<?php echo $formAction; ?>" method="post">

    <label for="keyword">Keyword:</label>
    <input style="width:358px;" id="keyword" name="keyword" type="text" value="<?php echo $formKeyword; ?>" />

    <br/>
    Anonymous HTTP Proxies (optional):
    <br/>
    <textarea placeholder="host:port (one per line)" name="proxies" rows="5" cols="50"><?php echo $formProxies; ?></textarea>
    <br/>
    <label for="pagedepth">Pages Per Source:</label>
    <select id="pagedepth" name="pagedepth">
    <?php for ($x = 1; $x < 11; $x++) { ?>
        <option <?php echo ($formDepth == $x ? 'selected="selected"' : ''); ?> value="<?php echo $x; ?>"><?php echo $x; ?></option>
    <?php } ?>
    </select>(*USE WITH CAUTION*)
    <br />

    <br />
    <input name="submit" type="submit" />
    </form>
    <br />
    <?php /* $results is an HTML fragment, so it is echoed as markup. */ echo (isset($results) && $results) ? $results : ''; ?>
</html>




10.gif