web scraping part 1

This script downloads the email addresses of all recruiters.

=============================================
Using a simple regex.
=============================================
#!/usr/bin/perl

use strict;
use warnings;
use Data::Dumper;
use LWP::Simple;




# Base URL of the recruiter listing page. The part of each recruiter link
# that follows "/recruiter/" is appended directly to it, so it presumably
# has to end in ".../recruiter/" -- TODO confirm against the target site.
# The script never declared $url (a compile error under strict), so it is
# taken from the command line instead of being guessed here.
my $url = shift @ARGV
    or die "Usage: $0 <recruiter-listing-url>\n";

# LWP::Simple::get returns undef on any failure; stop early instead of
# running the regex over an undefined value.
my $html = get($url);
die "Could not fetch $url\n" unless defined $html;

# Scan the whole page instead of going line by line so that several links
# on one line are all found. [^"]* keeps every capture inside its own href
# attribute (the greedy .* could run on to a later tag on the same line).
while ($html =~ m{<a href="/recruiter/([^"\s][^"]*/)">}g) {

    # Copy the capture right away: the inner match below overwrites $1.
    my $recruiter_url = "$url$1";
    print "$recruiter_url\n";

    my $recruiter_html = get($recruiter_url);
    if (!defined $recruiter_html) {
        # One unreachable recruiter page should not abort the whole run.
        warn "Could not fetch $recruiter_url\n";
        next;
    }

    # Print every CyberCoders address offered as a mailto link on the page.
    while ($recruiter_html
        =~ m{<a class="recruiter-email-link" href="mailto:([^"\s][^"]*?\@CyberCoders\.com)">}g)
    {
        print "$1\n";
    }
}

=========================================================
Using HTML::TreeBuilder::XPath

#!/usr/bin/perl

use strict;
use warnings;
use Data::Dumper;
use LWP::Simple;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;




# Base URL of the recruiter listing page. Recruiter paths (with their
# "/recruiter/" prefix removed) are appended directly to it, so it
# presumably has to end in ".../recruiter/" -- TODO confirm against the
# target site. The script never declared $url (a compile error under
# strict), so it is taken from the command line instead of being guessed.
my $url = shift @ARGV
    or die "Usage: $0 <recruiter-listing-url>\n";

# LWP::Simple::get returns undef on any failure.
my $html = get($url);
die "Could not fetch $url\n" unless defined $html;

# Hand the document to new_from_content() so it is parsed and finalised in
# one step, rather than building an empty tree and calling parse() on it
# afterwards without a closing eof().
my $tree = HTML::TreeBuilder::XPath->new_from_content($html);

# One <div class="recruiter-item"> per recruiter. The predicate needs its
# closing ']' or the XPath expression does not parse.
my @customers = $tree->findnodes('//div[@class="recruiter-item"]');

for my $custm (@customers) {
    # The first <a> inside the item is taken to be the recruiter link.
    my ($first_link) = $custm->look_down(_tag => 'a');
    next unless defined $first_link;

    my $href = $first_link->attr('href');
    next unless defined $href;

    # get_email() prints this path at the end of its trace line and adds no
    # newline of its own, so the trailing "\n" is kept on purpose.
    my $recruiter_pth = $href . "\n";

    # Keep only the part after "/recruiter/"; get_email() appends it to $url.
    $recruiter_pth =~ s{/recruiter/}{}g;
    get_email($recruiter_pth);
}

# Release the listing tree once every recruiter has been processed.
$tree->delete;


# Fetch one recruiter page and print the href of its e-mail link.
#
# Parameters:
#   $r_pth - recruiter path, appended to the file-level $url to form the
#            page address. Trailing whitespace (the caller passes a
#            trailing newline) is ignored.
#
# Prints the href of the first <a class="recruiter-email-link"> element,
# followed by a "get <page address>" trace line. If the page cannot be
# fetched or has no such link, a warning is issued and nothing is printed,
# so one bad recruiter page does not abort the whole run.
#
# Returns nothing.
sub get_email {
    my ($r_pth) = @_;
    return unless defined $r_pth;

    # Strip trailing whitespace so neither the request address nor the
    # trace line depends on how the caller terminated the path.
    $r_pth =~ s/\s+\z//;
    my $recruiter_url = "$url$r_pth";

    # LWP::Simple::get returns undef on any failure.
    my $html = get($recruiter_url);
    if (!defined $html) {
        warn "Could not fetch $recruiter_url\n";
        return;
    }

    my $tree2 = HTML::TreeBuilder->new_from_content($html);
    my ($mailto) = $tree2->look_down(
        _tag  => 'a',
        class => 'recruiter-email-link',
    );

    # Read the attribute before releasing the tree.
    my $href = defined $mailto ? $mailto->attr('href') : undef;
    $tree2->delete;

    if (!defined $href) {
        warn "No recruiter e-mail link found at $recruiter_url\n";
        return;
    }

    print "$href\n";
    print "get $recruiter_url\n";
    return;
}

No comments:

Other Articles

Enter your email address: