*Related Content, Phone # , Email Scraper*
Code:
http://i.imgur.com/lyGFI.png
Description:
Web-based script written in PHP allowing super simple, but blazing fast scraping of what's important! It offers an easy way to scrape the top 3 search engines (Google, Yahoo, Bing) and their results pages (up to 10 full pages each). It then automatically parses and displays the emails and phone numbers related to the keyword you specify. It's extremely quick, and can scrape and parse 30 pages of SERPs in seconds (make sure to use proxies to avoid bans from the sources). Enjoy!
Features:
- Collect email addresses, phone numbers, and related content for any keyword
- Scrapes Google, Yahoo, and Bing simultaneously (up to 30 threads at a time!)
- Randomly chooses from a built-in list of common User-Agents for extra anonymity.
- Choose how many pages of SERPs to scrape per source
- Anonymous HTTP Proxy Support
- Randomly chooses new proxies for each thread (yes up to 30)
- "Multi-threaded", blazing fast parallel Scraping using PHP's curl_multi
- Super simple and easy to install, even on your Windows PC (WAMP stack)!
- Under 200 lines of code!
Example Uses:
- Build targeted contact lists (and even sell them if you want)
- Use for SMS or Email marketing (so many ways available)
- Setup macros to auto generate content for your BH needs
- Basic filtering of emails/phone numbers from SERP snippets
- And many more creative ways!
Requirements:
- PHP5 with the cURL and DOM extensions enabled
Installation:
1. Copy php code onto your server (paste the code into a file and name it with .php extension).
2. Access the script in any browser
3. Enjoy!
Screenshots:
Code:
http://i.imgur.com/iKyys.jpg
PHP Code:
<?php
// Silence every PHP diagnostic for this page: errors are neither displayed
// nor reported.
ini_set('display_errors','0');
error_reporting(0);
/**
 * Scrapes Google, Yahoo and Bing result pages for a keyword and harvests the
 * phone numbers and email addresses found in the result snippets.
 *
 * All work is done by the constructor; afterwards the rendered HTML report is
 * available in the public $results property.
 */
class BHW_Contact_Scraper{
    /** Hard cap on the number of result pages requested per search engine. */
    const MAX_PAGES = 10;

    /** @var string HTML report; sections are appended while processing. */
    public $results = '<h2>Harvested Results</h2>';
    /** @var string Phrase prepended to the keyword in every search query. */
    public $query = '"contact us at"';
    private $_rawKeyword;
    private $_maxPages;
    private $_multiHandle = null;
    /** @var array Individual cURL handles, one per requested URL. */
    private $_handles = array();
    private $_proxies = array();
    private $_userAgents = array(
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB0.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; GACID=)"
    );
    /**
     * Host fragment => report key and snippet XPath for each search engine.
     * @var array
     */
    private $_sources = array(
        'google' => array('key' => 'g', 'xpath' => '//li[@class="g"]/div'),
        'yahoo'  => array('key' => 'y', 'xpath' => '//div[@id="web"]/ol/li/div/div[@class="abstr"]'),
        'bing'   => array('key' => 'b', 'xpath' => '//div[@id="results_container"]/div[@id="results"]/ul/li/div/p')
    );

    /**
     * Runs the whole scrape and fills $results.
     *
     * @param string     $keyword Search keyword; non-scalar values are treated as ''.
     * @param int|string $depth   Result pages per engine; clamped to 0..MAX_PAGES,
     *                            non-numeric values request no pages.
     * @param string     $proxies Newline-separated "host:port" proxies (optional).
     */
    public function __construct($keyword,$depth=1,$proxies=''){
        $this->_rawKeyword = is_scalar($keyword) ? (string)$keyword : '';
        // The depth comes straight from a request; bound it so a crafted value
        // cannot open an arbitrary number of connections.
        $this->_maxPages = is_numeric($depth) ? max(0, min(self::MAX_PAGES, (int)$depth)) : 0;
        $this->_proxies = $this->_parseProxies($proxies);

        $query = urlencode($this->query.$this->_rawKeyword);
        $urls = array();
        for($page=0;$page<$this->_maxPages;$page++){
            $offset = $page * 10;
            $urls[] = 'http://www.google.com/search?q='.$query.($page>0?'&start='.$offset:'');
            $urls[] = 'http://search.yahoo.com/search?p='.$query.($page>0?'&b='.($offset+1):'');
            $urls[] = 'http://bing.com/search?q='.$query.($page>0?'&first='.($offset+1):'');
        }
        $this->_setupHandles($urls);
        $this->_crawlAll();
        $this->_process();
    }

    /** Splits the proxy textarea content into a list of non-empty, trimmed lines. */
    private function _parseProxies($proxies){
        $list = array();
        if(!is_string($proxies)){
            return $list;
        }
        foreach(preg_split('/\r\n|\r|\n/', $proxies) as $line){
            $line = trim($line);
            if($line !== ''){
                $list[] = $line;
            }
        }
        return $list;
    }

    /** Creates one cURL handle per URL and registers it with a multi handle. */
    private function _setupHandles($urls){
        if(!is_array($urls) || count($urls) === 0){
            return;
        }
        $this->_multiHandle = curl_multi_init();
        foreach($urls as $url){
            $ch = curl_init();
            $options = array();
            $options[CURLOPT_URL] = $url;
            // Send the User-Agent string itself, not its index in the list.
            $options[CURLOPT_USERAGENT] = $this->_userAgents[mt_rand(0,count($this->_userAgents)-1)];
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_RETURNTRANSFER] = 1;
            $options[CURLOPT_TIMEOUT] = 10;
            $options[CURLOPT_CONNECTTIMEOUT] = 10;
            //proxy support: libcurl accepts "host:port" directly in CURLOPT_PROXY,
            //so entries without a port no longer produce an undefined index.
            if(count($this->_proxies) > 0){
                $options[CURLOPT_PROXY] = $this->_proxies[mt_rand(0,count($this->_proxies)-1)];
            }
            curl_setopt_array($ch, $options);
            $this->_handles[] = $ch;
            curl_multi_add_handle($this->_multiHandle, $ch);
        }
    }

    /** Drives all registered transfers in parallel until they have finished. */
    private function _crawlAll(){
        $mh = $this->_multiHandle;
        if($mh === null){
            return;
        }
        $active = null;
        do{
            do{
                $mrc = curl_multi_exec($mh, $active);
            }while($mrc == CURLM_CALL_MULTI_PERFORM);
            // curl_multi_select() may return -1 immediately; back off briefly and
            // keep calling curl_multi_exec() instead of spinning without progress.
            if($active && $mrc == CURLM_OK && curl_multi_select($mh) == -1){
                usleep(100000);
            }
        }while($active && $mrc == CURLM_OK);
    }

    /**
     * Parses every downloaded page, extracts numbers and emails from the
     * snippets and appends the report sections to $results.
     *
     * Echoes an error and stops (leaving $results untouched) when cURL reports
     * that a proxy host could not be resolved.
     */
    private function _process(){
        $snippets = '';
        $snips = array('g' => '', 'y' => '', 'b' => '');
        foreach($this->_handles as $ch){
            $html = (string)curl_multi_getcontent($ch);
            if($html === ''){
                $e = curl_error($ch);
                if(stristr($e,"Couldn't resolve proxy")){
                    echo '<h1 style="color:red;">ERROR: '.$this->_escape($e).'</h1>';
                    $this->_releaseHandles();
                    return;
                }
                // Nothing to parse for this request.
                continue;
            }
            $curlInfo = curl_getinfo($ch);
            $source = $this->_sourceFor(isset($curlInfo['url']) ? $curlInfo['url'] : '');
            if($source === null){
                continue;
            }
            foreach($this->_extractSnippets($html, $source['xpath']) as $text){
                $snippets .= ' '.$text.' ';
                $snips[$source['key']] .= ' '.$text.' ';
            }
        }
        $this->_releaseHandles();

        //numbers
        $numbers = array();
        if(preg_match_all('/[^0-9]([0-9]{3})[^0-9]*?([0-9]{3})[^0-9]*?([0-9]{4})[^0-9]/', $snippets, $found, PREG_SET_ORDER)){
            foreach($found as $set){
                $numbers[] = $set[1].'-'.$set[2].'-'.$set[3];
            }
        }
        $numbers = array_unique($numbers);

        //emails
        $emails = array();
        if(preg_match_all('/([a-zA-Z_.0-9]{1,}?)@([a-zA-Z\-0-9]{2,}?)\.(([a-zA-Z\-0-9]{3})|(co\.uk))/', $snippets, $found, PREG_SET_ORDER)){
            foreach($found as $set){
                $emails[] = $set[1].'@'.$set[2].'.'.$set[3];
            }
        }
        $emails = array_unique($emails);

        // The keyword and the scraped text are untrusted; escape them before
        // they are placed into the HTML report. Numbers and emails only contain
        // characters admitted by the patterns above.
        $keyword = $this->_escape($this->_rawKeyword);
        $this->results .= '<h3>Numbers Related to "'.$keyword.'"</h3>'.implode('<br/>',$numbers);
        $this->results .= '<h3>Emails Related to "'.$keyword.'"</h3>'.implode('<br/>',$emails);
        $this->results .= '<h3>Google Content Related to "'.$keyword.'"</h3>'.$this->_escape($snips['g']);
        $this->results .= '<h3>Yahoo Content Related to "'.$keyword.'"</h3>'.$this->_escape($snips['y']);
        $this->results .= '<h3>Bing Content Related to "'.$keyword.'"</h3>'.$this->_escape($snips['b']);
        $this->results .= '<h3>Combined Raw SERP Content Related to "'.$keyword.'"</h3>'.$this->_escape($snippets);
    }

    /**
     * Picks the search-engine entry for a URL by inspecting its host only, so a
     * keyword such as "bing" inside the query string cannot select the wrong engine.
     *
     * @return array|null Entry from $_sources, or null for an unknown host.
     */
    private function _sourceFor($url){
        $host = strtolower((string)parse_url($url, PHP_URL_HOST));
        foreach($this->_sources as $needle => $source){
            if(strpos($host, $needle) !== false){
                return $source;
            }
        }
        return null;
    }

    /** Returns the text content of every node matched by $xquery in $html. */
    private function _extractSnippets($html, $xquery){
        $texts = array();
        // Collect libxml parse warnings internally instead of suppressing with @.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if(!$loaded){
            return $texts;
        }
        $xpath = new DOMXPath($dom);
        $nodes = $xpath->query($xquery);
        if($nodes === false){
            return $texts;
        }
        foreach($nodes as $node){
            $texts[] = $node->nodeValue;
        }
        return $texts;
    }

    /** Detaches all transfers and drops the handle references so PHP can free them. */
    private function _releaseHandles(){
        if($this->_multiHandle !== null){
            foreach($this->_handles as $ch){
                curl_multi_remove_handle($this->_multiHandle, $ch);
            }
        }
        $this->_handles = array();
        $this->_multiHandle = null;
    }

    /** Escapes text for safe inclusion in HTML content or attribute values. */
    private function _escape($text){
        return htmlspecialchars((string)$text, ENT_QUOTES, 'UTF-8');
    }
}
// Handle a submitted form: run the scraper when a keyword was given, otherwise
// print an error banner. $results is always defined so the template below can
// echo it without touching an undefined variable.
$results = '';
if(isset($_POST['submit'])){
    $keyword = (isset($_POST['keyword']) && is_string($_POST['keyword'])) ? $_POST['keyword'] : '';
    if(trim($keyword) !== ''){
        // Keep the page depth inside the 1..10 range offered by the form; a
        // crafted non-numeric value falls back to a single page.
        $depth = (isset($_POST['pagedepth']) && is_numeric($_POST['pagedepth'])) ? (int)$_POST['pagedepth'] : 1;
        $depth = max(1, min(10, $depth));
        $proxies = (isset($_POST['proxies']) && is_string($_POST['proxies'])) ? $_POST['proxies'] : '';
        $scraper = new BHW_Contact_Scraper($keyword,$depth,$proxies);
        $results = $scraper->results;
    }else{
        echo '<h1 style="color:red; font-weight: bold;">YOU FORGOT THE KEYWORD</h1>'."\n";
    }
}
?>
<html>
<?php
// Every value that originates from the request is HTML-escaped before it is
// written into the page. The form posts back to the current request path
// (relative to the current scheme and host), so it also works over HTTPS and
// does not reflect the client-supplied Host header.
$formAction = htmlspecialchars(isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '', ENT_QUOTES, 'UTF-8');
$keywordValue = htmlspecialchars((isset($_POST['keyword']) && is_string($_POST['keyword'])) ? $_POST['keyword'] : '', ENT_QUOTES, 'UTF-8');
$proxiesValue = htmlspecialchars((isset($_POST['proxies']) && is_string($_POST['proxies'])) ? $_POST['proxies'] : '', ENT_QUOTES, 'UTF-8');
$selectedDepth = (isset($_POST['pagedepth']) && is_numeric($_POST['pagedepth'])) ? (int)$_POST['pagedepth'] : 0;
?>
<h2>R elated Content, Phone # , Email Scraper by Amrak @ BHW</h2>
<form action="<?php echo $formAction;?>" method="post">
<label for="keyword">Keyword:</label>
<input style="width:358px;" id="keyword" name="keyword" type="text" value="<?php echo $keywordValue;?>" />
<br/>
<label for="proxies">Anonymous HTTP Proxies (optional):</label>
<br/>
<textarea placeholder="host:port (one per line)" id="proxies" name="proxies" rows="5" cols="50"><?php echo $proxiesValue;?></textarea>
<br/>
<label for="pagedepth">Pages Per Source:</label>
<select id="pagedepth" name="pagedepth">
<?php for($x=1;$x<11;$x++){?>
<option <?php echo ($selectedDepth===$x?'selected="selected"':'')?> value="<?php echo $x;?>"><?php echo $x;?></option>
<?php }?>
</select>(*USE WITH CAUTION*)
<br />
<br />
<input name="submit" type="submit" />
</form>
<br />
<?php
// $results holds the scraper's HTML report (when one was produced) and is
// therefore written out as markup, not escaped here.
echo (isset($results) ? $results : '');
?>
</html>