#! /usr/bin/perl

#
# Author: Sebastian Enger, M.Sc.
# Date: 3/27/2016
# Website: www.OneTIPP.com
# Email: Sebastian.Enger@gmail.com
# Topic: Parse content of current "Versicherungswebsite" for OneTIPP Demo Framework
# Version: 0.0.4
#

use strict;
use DBI;
use File::Find;	# perl -MCPAN -e 'force install "File::Find"'
use File::Copy;	# perl -MCPAN -e 'force install "File::Copy"'
use File::Path;	# perl -MCPAN -e 'force install "File::Path"'
#use XML::Code; # perl -MCPAN -e 'force install "Curses"'
use File::Basename;
use Data::Dumper;
use String::Random; # perl -MCPAN -e 'force install "String::Random"'
use Digest::SHA qw(sha512_hex);
use Encode qw/from_to/;
use Encode::Encoder qw(encoder);

# ---------------------------------------------------------------------------
# Main script: walk the mirrored site below $SourceFolder, let
# SourceStructure() collect sentence-like text fragments into @array, then
# write the de-duplicated fragments to the log file and echo them to STDOUT.
# ---------------------------------------------------------------------------

my $scriptname 		= basename($0);
my $count 			= 0;
my $pass 			= String::Random->new;	# direct method call instead of indirect-object "new Class"

my $SourceFolder 	= '/home/Framework/DemoParser/rawdata/www.feuersozietaet.de';
my $log_path		= "/home/Framework/DemoParser/log";
my $string1;
my $catg;
my $title;
my $string;
my $rcount = 0;
my $Hash;
my $dbh;
my %Hash = ();

my $enc 		= 'latin1'; # the encoding this script file is saved in
my $logfile 	= "$log_path/$scriptname.txt";
unlink $logfile;
my @array  = ();	# filled by SourceStructure() during the find() traversal

find(\&SourceStructure, $SourceFolder);

# De-duplicate the collected fragments. The keys are sorted so that the log
# content is reproducible from run to run (hash key order is not).
my %hash   = map { $_ => 1 } @array;
my @unique = sort keys(%hash);

# Three-argument open with a lexical handle; fail loudly instead of silently
# printing to an unopened handle when the log directory is missing or not
# writable.
open(my $log_fh, '>', $logfile)
	or die "Cannot open log file $logfile for writing: $!\n";
binmode($log_fh);
foreach my $w (@unique){
	#	next if (length($w)<90);
	# keep only fragments that end in sentence punctuation
	next if ($w !~ m/[.!?]$/);
	print {$log_fh} "$w\n";
	print "< " .length($w) . " > $w\n";
}
# Buffered write errors only surface at close time, so check it.
close($log_fh)
	or die "Cannot close log file $logfile: $!\n";

# File::Find callback: collect sentence-like text fragments from one HTML file.
#
# find() calls this once per directory entry with $File::Find::name set to the
# entry's path ("$File::Find::dir/$_") and the process chdir()'d into
# $File::Find::dir. Plain files whose path contains ".htm" and none of
# ".asp", ".aspx", "?" or "=" are slurped; every run of text between a ">"
# and the following "<" is extracted, filtered, converted from latin1 to
# UTF-8 and appended to the file-level @array.
#
# Takes no arguments. Returns an error message string when the file cannot be
# opened (File::Find ignores the return value; the message is also sent to
# STDERR), otherwise nothing meaningful.
sub SourceStructure {

	my $SourceFilename = $File::Find::name;

	# if ( $SourceFilename =~ /\.html/ig && $SourceFilename !~ /trackback/ig){ #$SourceFilename =~ /\.(html|htm|shtm|shtml|php)/ig ){
	return if $SourceFilename =~ /(?:\.asp|\.aspx|\?|\=)/i;
	return unless $SourceFilename =~ /\.htm/i;
	return unless -f $SourceFilename;	# a directory named "*.htm*" has no content to read

	# Slurp the whole file as raw bytes. Three-argument open keeps characters
	# in the file name (leading/trailing blanks, ">", "|") from being
	# interpreted as an open mode.
	my $string;
	{
		open(my $fh, '<', $SourceFilename) or do {
			my $error = "Couldn't open file: $SourceFilename $!";
			warn "$error\n";	# File::Find drops our return value, so report it here
			return $error;
		};
		binmode $fh;
		local $/ = undef;
		$string = <$fh>;
		close $fh;
	}
	$string = '' unless defined $string;

	print "< " .length($string) . " > Working on : $SourceFilename \n";

	my @texte1 = ($string =~ m#\>\s*(.*?)\s*\<#igs);

	# latin1 byte for each umlaut entity, keyed by the entity's first letter so
	# that "&Auml;" stays upper case. Written as \x escapes so the result does
	# not depend on the encoding this script file happens to be saved in.
	my %umlaut_for = (
		a => "\xE4", o => "\xF6", u => "\xFC",
		A => "\xC4", O => "\xD6", U => "\xDC",
	);

	foreach my $e (@texte1){
		$e =~ s/&([aou])uml;/$umlaut_for{$1}/ig;

		# Drop fragments that look like script/CSS/markup residue, still contain
		# an unresolved entity ("&"), or contain a dotted token such as a host
		# name or a number with a decimal point.
		next if ($e =~ m/(?:var|\=|\#|\}|\{|px;|javascript|\(\)|\&|\w\.\w)/i);
		#next if (length($e)<90);
		# keep only fragments that end in sentence punctuation
		next if ($e !~ m/[.!?]$/);

		# The fragment is treated as latin1 bytes; convert in place to UTF-8 bytes.
		from_to($e,"latin1","utf8");

		push(@array, $e);
	}

	return;
}
# Remove HTML markup remnants from a single string.
#
# Parameter: the text to clean (first argument).
# Returns:   a copy of the text with every "<...>" tag on a single line
#            removed, every 'content="' prefix removed (case-insensitive) and
#            every double quote removed. An undefined argument is returned
#            unchanged.
#
# Declared without a prototype: the former empty "()" prototype made an
# ordinary call such as strip_tags($html) a compile-time error.
sub strip_tags {
	my $content = shift;
	return $content unless defined $content;
	$content =~ s|<.+?>||g;
	$content =~ s|content="||ig;
	$content =~ s|"||g;
	return $content;
}
# Remove HTML markup remnants from every string in a list.
#
# Parameters: any number of strings; array references among the arguments are
#             flattened into their elements.
# Returns:    a list of cleaned copies, in input order. For each element the
#             following are removed: complete "<...>" tags, any leading
#             fragment up to and including a ">", every 'content="' prefix
#             (case-insensitive) and every double quote. The caller's data is
#             not modified.
#
# Declared without a prototype (the former empty "()" prototype rejected
# ordinary calls with arguments), and reads all of @_ instead of only the
# first argument, which silently dropped the rest of the list.
sub strip_tags_array {
	my @arr = map { ref($_) eq 'ARRAY' ? @{$_} : $_ } @_;
	my @retArray = ();
	foreach my $content (@arr){
		next unless defined $content;
		$content =~ s|<.+?>||g;
		$content =~ s|.+?>||g;	# dangling tag fragment such as 'class="x">'
		$content =~ s|content="||ig;
		$content =~ s|"||g;
		push(@retArray, $content);
		#$content =~ s#<([^">]+(?:"[^"]+")*[^>]+)>##ig;
	}
	return @retArray;
}
# Extract the targets of all href attributes found in a piece of HTML.
#
# Parameter: the HTML text to scan (first argument).
# Returns:   the list of href values in document order; each value runs from
#            after an optional opening double quote up to the next double
#            quote, whitespace or ">". Returns an empty list when nothing
#            matches.
#
# Declared without a prototype: the former empty "()" prototype made an
# ordinary call such as links($html) a compile-time error.
sub links {
	my $content = shift;
	return () unless defined $content;
	my @links = $content =~ /href\s*=\s*"?([^"\s>]+)/gis;
	return @links;
}

# Page-specific extractors: each statement collects, in list context, every
# capture of its pattern from the file-level $string (case-insensitive, "."
# also matching newlines).
# NOTE(review): nothing visible in this file assigns to the file-level
# $string, so as written each list would be empty - confirm whether this is
# leftover code.

# help text cell "td_ausfall_text"
my @texte1 = $string =~ m#<td colspan="3" id="td_ausfall_text" class="hilfehidden">\s*(.*?)\s*</td></tr>#igs;

# help text cell "td_laufzeit_text"
my @texte2 = $string =~ m#<td colspan="3" id="td_laufzeit_text" class="hilfehidden">\s*(.*?)\s*</td></tr>#igs;

# content following the "gesell_close" link, up to the FAQ link paragraph
my @texte3 = $string =~ m#<a id="gesell_close" href="javascript:showfaq('box_gesell')">\s*(.*?)\s*</div><p class="pfaqlink">#igs;

# text cell content up to the "frage_1" anchor
my @texte4 = $string =~ m#<td class="txt">\s*(.*?)\s*<a style="position:relative" name="frage_1" id="frage_1">#igs;

# content following the "frage_2" anchor
my @texte5 = $string =~ m#<a style="position:relative" name="frage_2" id="frage_2">\s*(.*?)\s*</center></div>#igs;

# content following the styled <h2>, up to the TYPO3SEARCH_end marker
# ("#" is the pattern delimiter here, hence the escaped "\#" colour values)
my @texte6 = $string =~ m#<h2 style="font-size:12px; color: \#fff; background: \#933; padding:3px 10px;">\s*(.*?)\s*</div><!--TYPO3SEARCH_end--></div>#igs;
