Extract data from an HTML table using HTML::TableExtract

In this tutorial you will learn:

1) How to extract data from an HTML table
2) Regular expressions (regex)
3) Appending a hash to a hash
4) for loops

----------------------------------------------------------------------------------------------
HTML file 
----------------------------------------------------------------------------------------------
<table class="table1" border="1" cellspacing="1" cellpadding="1" style="width: 1144px; height: 883px;">

<tr>
<td> <b>Dom0<br /></b>
</td><td> <b>DomU <br /></b>
</td><td> <b>ipaddress<br /></b>
</td><td> <b>server name <br /></b>
</td><td> <b>application <br /></b>
</td><td> <b>website <br /></b>
</td><td> <b>assigned to </b><br />
</td></tr>
<tr>
<td> S1 (1.51)<br />
</td><td> yourwebsitenews-db 10GB ram 100 GB HDD &nbsp;&nbsp;
</td><td> 1.0.0.101<br />
</td><td> yourwebsitenews-db
</td><td> win2k8 ,sql
</td><td> MSSQL Server
</td><td> yourwebsitenews
</td></tr>
<tr>
<td> <br />
</td><td> yourwebsitetvweb3&nbsp; 4GB ram 48 GB HDD<br />
</td><td> 9.9.02.132<br />
</td><td> yourwebsitetvweb3<br />
</td><td> win2k8,IIS<br />
</td><td> yourwebsite TV <br />
</td><td> yourwebsitetv<br />
</td></tr>
<tr>
<td> <br />
</td><td> yourwebsitetvweb4&nbsp; 4GB ram 30 GB HDD<br />
</td><td> 9.9.202.146<br />
</td><td> yourwebsitetvweb4<br />
</td><td> win2k8,IIS<br />
</td><td> yourwebsite TV <br />
</td><td> yourwebsitetv<br />
</td></tr>
<tr>
<td> <br />
</td><td> <br />
</td><td> <br />
</td></tr>
<tr>
<td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td></tr>
<tr>
<td> S2 (1.52) <br />
</td><td> web1 5.5GB ram 50GB hdd<br />
</td><td> 1.0.0.106<br />
</td><td> web1 <br />
</td><td> apache,php,memcache<br />
</td><td> someweb,cndas,lms.someweb.com<br />
</td><td> IWPL
</td></tr>
<tr>
<td> <br />
</td><td> web4 5.5GB ram 50GB hdd<br />
</td><td> 1.0.0.109<br />
</td><td> web4<br />
</td><td> apache,php,memcache<br />
</td><td> b.in,gs.in.com
</td><td> IWPL
</td></tr>
<tr>
<td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td></tr>
<tr>
<td> S3 (1.53) <br />
</td><td> DB1 11GB ram 100GB hdd <br />
</td><td> 1.0.0.111<br />
</td><td> db1<br />
</td><td> mysql <br />
</td><td> all commons (auth  ),someweb<br />
</td><td> IL
</td></tr>
<tr>
<td> <br />
<tr>
<td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td></tr>
<tr>
<td> S4 (1.54) <br />
</td><td> DB3 10GB ram 200GB hdd<br />
</td><td> 1.0.0.116<br />
</td><td> db3<br />
</td><td> mysql <br />
</td><td> ors_db,ls_ting
</td><td> IWPL
</td></tr>
<tr>
<td> <br />
</td><td> yourwebsitetvmemcach-1 10 Gb ram 24 Gb hdd<br />
</td><td> 9.9.202.134<br />
</td><td> yourwebsitetvmem-1<br />
</td><td> yourwebsite Tv Memcached Server-1<br />
</td><td> yourwebsite Tv Memcached Server-1<br />
</td><td> yourwebsitetv
</td></tr>
<tr>
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td></tr>
<tr>
<td> S5 (1.55)<br />
</td><td> 1 systems ops 8GB 100GB <br />
</td><td> 1.0.0.121<br />
</td><td> ops<br />
</td><td> mysql,php,apache<br /><br />
</td><td> odw <br /> opsview <br /> reports <br /> runtime <br />
</td><td> IWPL
</td></tr>
<tr>
<td> <br />
<td> <br />
<td> <br />
</td><td> cache 4GB ram 129GB hdd<br />
</td><td> 9.9.202.209<br />
</td><td> linux-yourwebsite<br />
</td><td> php,apache<br />
</td><td> yourwebsite <br />
</td><td> yourwebsitetv
</td></tr>
<tr>
<td> <br />
</td><td> cache 6GB ram 20GB hdd<br />
</td><td> 1.0.0.125<br />
</td><td> cache-9<br />
</td><td> backup server
</td><td> backup server
</td><td> IL
</td></tr>
<tr>
<td> <br />
</td><td> web16- 5 GB RAM 46GB HDD
</td><td> 1.0.0.103
</td><td> web16
</td><td> php,apache
</td><td> voiofa.com,solr instance
</td><td> Il
</td></tr>
<tr>
<td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td><td> <br />
</td></tr>
<tr>
<td> S6 (1.56) <br />
</td><td> web9 5GBram <br />
</td><td> 1.0.0.126<br />
</td><td> web9 <br />
</td><td> apache,php,memcache<br />
</td><td> Il
</td></tr>
<tr>
<td> <br />
</td><td> web10 <br />
</td><td> 1.0.0.127<br />
</td><td> web10<br />
</td><td> apache,php,memcache<br />
</td><td> Il
</td></tr>
<tr>
<td> <br />
</td><td> r1s6v1web13&nbsp;5-GB ram - 50-GB hdd<br />
</td><td> 1.0.0.129<br />
</td><td> web13<br />
</td><td> apache,php,memcache
</td><td> critry,m.try,admin.ccry.com
</td><td> IL
</td></tr></table>


==================================================================
Perl script
==================================================================
#!/usr/bin/perl 

use strict;
use warnings; 
use Data::Dumper;
use LWP::Simple;
use HTML::TableExtract;


# Host the table on a site you can reach and pass its URL as the first
# command-line argument. The default below is only a placeholder -- change it.
my $url = @ARGV ? $ARGV[0] : 'http://localhost/table.html';

# Every element of @data is an array ref holding the cells of one table row:
#   [0] Dom0   [1] DomU   [2] ipaddress   [3] server name
#   [4] application   [5] website   [6] assigned to
my @data = get_data($url);

# The first row is the table header, not server data.
shift @data;

# Dom0 address => DomU ip address => list of hashes with the keys
# serverdetails, servername, application and websites.
my %serverdata;

# A Dom0 cell looks like "S1 (1.51)": an alphanumeric label, white space, and
# the tail of the address in round braces (captured as $1).
my $dom0_re = qr/
    [a-z]+ [0-9]+ [a-z0-9]*   # alphanumeric label such as S1
    \s+                       # white space
    \( (.*) \)                # round braces around the address tail
/isx;

# Last non-empty Dom0 cell seen; rows whose Dom0 cell is blank inherit it.
my $dom0_label;

foreach my $row (@data) {
  # Spacer rows and malformed rows carry no ip address: nothing to record.
  next unless $row->[2];

  $dom0_label = $row->[0] if $row->[0];

  # "S1 (1.51)" becomes "172.16.1.51". Rows that show up before any Dom0 label
  # are filed under 'unknown' instead of warning about an undefined value.
  my $dom0 = defined $dom0_label ? $dom0_label : 'unknown';
  $dom0 =~ s/$dom0_re/172.16.$1/g;

  # Debugging aid -- uncomment to trace every row as it is stored:
  # print "$dom0 -----> $row->[2] ==========> $row->[1], $row->[3], $row->[4] $row->[5] $row->[6]\n";

  push @{ $serverdata{$dom0}{ $row->[2] } }, {
    'serverdetails' => $row->[1],
    'servername'    => $row->[3],
    'application'   => $row->[4],
    'websites'      => $row->[5],
  };
}

# Sorted so that the report comes out in the same order on every run.
foreach my $keysi (sort keys %serverdata) {
  print "================== $keysi ===============\n";
  print Dumper($serverdata{$keysi});
  print "==================  ===============\n";
}




# get_data($url)
#
# Downloads the page at $url, extracts every table with class "table1"
# (depth 0, count 0) and returns a list of array refs, one per table row.
# Each cell is trimmed of leading/trailing white space and of doubled
# newlines; an undefined cell is returned as an empty string so that every
# cell keeps its column position.
#
# Dies if the page cannot be fetched.
sub get_data {
  my ($sub_url) = @_;

  # LWP::Simple::get returns undef on failure; stop here rather than parse
  # nothing and silently report an empty table.
  my $html = get($sub_url);
  die "Could not fetch '$sub_url'\n" unless defined $html;

  my $te = HTML::TableExtract->new(
    depth   => 0,
    count   => 0,
    attribs => { 'class' => "table1" },
  );
  $te->parse($html);

  my @arry;
  foreach my $ts ($te->tables) {
    foreach my $row ($ts->rows) {
      # map, not grep: every cell must be kept. Filtering on the result of
      # s/// would drop cells that needed no trimming and shift the columns.
      my @cells = map {
        my $cell = defined $_ ? $_ : '';
        $cell =~ s/\n\n|(^\s+|\s+$)//g;
        $cell;
      } @{$row};
      push @arry, \@cells;
    }
  }
  return @arry;
}


No comments:

Other Articles

Enter your email address: