#!/usr/bin/perl
# Copyright (c) 2010-2013 Index Data, http://www.indexdata.com
#
# ppstat - extracts reliability reports from pazpar2.log
#

use Data::Dumper;
use Getopt::Long;
use Text::CSV;
use IO::File;
use File::stat;

use strict;
use warnings;

# Package version; "our" replaces the obsolete "use vars" pragma.
our $VERSION = 1.8;

# Command line options, set by GetOptions() in main.
my $debug = 0;                   # 0..2, verbosity of diagnostics on STDERR
my $help;                        # print usage text and exit
my $display_step_errors  = 0;    # show JavaScript step error messages
my $display_query_errors = 0;    # show the user query that caused the error
my $display_time         = 0;    # chronological error list instead of counts
my $offset               = 0;    # byte position to resume reading the log at
my $offset_store         = 0;    # pick an offset file automatically
my $offset_filename;             # file that keeps the offset between runs

my $hash_log;                    # session id -> { user, db, q }
my $hash_db;                     # counters: {ALL}, {ERR}, {TIME} per user@db

# Return the usage/help text as a single string; callers do "die usage".
# (The empty "()" prototype was dropped: prototypes do not validate
# arguments and only complicate parsing.)
sub usage {
    <<EOF;
usage: $0 [ options ] pazpar2.log ....

--debug=0..2      	debug option, default: $debug
--display-query-errors	print the user search query responsible for the error, default: $display_query_errors
--display-step-errors	print JavaScript step error message, default: $display_step_errors
--display-time 		print time stamp of each error message, default: $display_time
--offset		start at offset in log file, default: $offset
--offset-filename=/path/to/offset.txt		keep offset of last run
--offset-store		select an offset-filename automatically, default: $offset_store

By default a total hit count of an error message is displayed. Use the option --display-time to
get a chronological error list.
EOF
}

# Read the byte offset of the previous run from $file.
# Only the first line of the file is used.  Returns 0 when the file
# does not exist or does not contain a plain decimal number (e.g. a
# truncated or corrupted offset file), so a later seek() stays sane.
sub read_offset {
    my $file = shift;

    if ( !-e $file ) {
        warn "offset file $file does not (yet) exist, skip\n" if $debug;
        return 0;
    }

    my $fh = IO::File->new( $file, "r" ) or die "open $file: $!\n";
    my $offset = <$fh>;    # only the first line matters
    $fh->close;

    $offset = '' if !defined $offset;
    chomp $offset;

    # guard against corrupted offset files: accept digits only
    if ( $offset !~ /^\d+$/ ) {
        warn "invalid offset '$offset' in $file, using 0\n" if $debug;
        return 0;
    }

    return $offset;
}

# Persist the byte offset $offset (default 0) to $file for the next run.
# Dies when the file cannot be opened or written.
sub write_offset {
    my $file = shift;
    my $offset = shift || 0;

    warn "open $file and write offset: $offset\n" if $debug;
    my $fh = IO::File->new( $file, "w" ) or die "open > $file: $!\n";
    print $fh $offset;

    # buffered write errors only surface at close, so check it
    $fh->close or die "close $file: $!\n";
}

#
# IndexData pazpar2 log file parsing
#
# Correlates Z39.50 initRequest / searchRequest / searchResponse /
# presentResponse log lines by session id and counts, per
# "user@database" key, the total number of searches and the failures:
#   $hash_db->{ALL}{$name}  - total searches
#   $hash_db->{ERR}{$name}  - failed searches
#   $hash_db->{TIME}{$name} - date of the most recent failure
#
# Named arguments:
#   offset          - byte position to resume reading at (one file only)
#   hash_db         - hash ref accumulator (autovivified when undef)
#   files           - array ref of log file names; empty list reads STDIN
#   offset_filename - path of the file keeping the offset between runs
#
# Returns the (updated) $hash_db hash ref.
#
sub parse_log {
    my %args = @_;

    my $offset          = $args{'offset'} || 0;
    my $hash_db         = $args{'hash_db'};
    my $files           = $args{'files'};
    my $offset_filename = $args{'offset_filename'};

    # accept either an array ref or a single name; no files means STDIN
    my @files = ref $files eq 'ARRAY' ? @$files : $files;
    push @files, 'STDIN' if scalar(@files) <= 0;

# first, parse a search query, extract the ID of the session
# and store the important information in $hash_log
#
# log example:
# 17:09:30-28/08 [log] - tcp:64.34.162.111:51734 51734 0.000000 Z3950 initRequest oa/oa36 - 81 Index Data PazPar2/ZOOM-C/YAZ 1.1.0/3.0.52 e687cb7eb87c841f0d1a374174d51d30371f2d97
# 14:34:54-22/02 [log] - tcp:93.220.79.199:17 17 0.000000 Z3950 searchRequest wikipedia 1 - RPN @attrset Bib-1 berlin
# 08:59:53-10/11 [log] - tcp:147.52.108.98:11609 11629 0.536000 Z3950 searchResponse Failure DIAG 114 21
#

    foreach my $logfile (@files) {

        my $fh;
        if ( $logfile eq 'STDIN' ) {
            $fh = \*STDIN;
            warn "read data from $fh\n" if $debug;
        }
        else {
            warn "open $logfile\n" if $debug;
            $fh = IO::File->new( $logfile, "r" ) or die "open $logfile: $!\n";

            # resuming at an offset only makes sense for a single file
            if ( $offset > 0 || $offset_filename ) {
                if ( scalar(@files) == 1 ) {
                    $offset = read_offset($offset_filename) if $offset_filename;
                    my $st = stat($logfile);

                    # an offset beyond EOF means a different or rotated
                    # file; ignore the offset and read from the start
                    if ( defined $st && $offset > 0 && $offset > $st->size ) {
                        warn "Offset $offset is larger than file size ",
                          $st->size, "for file $logfile, ignored.\n",
"Maybe you run a different file, or the file was log rotated?\n";
                    }
                    else {
                        warn "Start at offset: $offset\n" if $debug;
                        seek( $fh, $offset, 0 );

                        # NOTE(review): if stat() failed above, $st is undef
                        # and $st->size here would die (debug mode only) —
                        # verify whether that case can occur in practice
                        warn "Seek to end of file, no new data\n"
                          if $debug && $offset == $st->size;
                    }

                }

                else {
                    warn
                      "Ignore offset because more than one filename is given: ",
                      join( ", ", @files ), "\n";
                }
            }
        }

        # separator between user and database in the counter keys
        my $sep = '@';
        while (<$fh>) {
            chomp;

            next if /^\s*$/;

            # skip uninteresting traffic lines, e.g.:
            # 00:00:01-16/06 [log] - request id=11044 close=no
            next if / - request id=\d+ close=\w+$/;

           # 00:00:01-16/06 [log] - response id=11038 close=no duration=0.000435
            next if / - response id=\d+ close=\w+ duration=[\d\.]+$/;

            next if / HTTP_Request /;
            next if / HTTP_Response /;

            next if / Z3950 close \d+ lackOfActivity$/;
            next if / records to cache /;

            # no tabs please - tab is used below as a column separator
            s/\t/ /g;

            #
            # new pazpar2 log format has a PID as second field
            # detect the old format automatically, and support both formats
            #
            my @line = split;
            next if scalar(@line) < 2;

            # old format: no numeric PID, insert a dummy one
            if ( $line[1] !~ /^\d+$/ ) {
                @line = ( $line[0], 1, @line[ 1 .. $#line ] );

                #warn join " ", @line, "\n";
            }

            # positional fields, see the log examples above
            my (
                $date,   $pid,    $log,   $dummy1, $ip,
                $id,     $time,   $proto, $type,   $database,
                $number, $dummy2, @query
            ) = @line;

            my $q;

            # first field must be a time-date stamp like 17:09:30-28/08
            next if $date !~ /-/;

            # swap date and time, date first for sorting
            my $iso_date;
            {
                my ( $time2, $date2 ) = split( "-", $date );
                $iso_date = "$date2-$time2";

            }

            # skip "RPN @attrset Bib-1" prefix of the search query
            if ( $type && $type eq 'searchRequest' && $#query >= 3 ) {
                shift @query;
                shift @query;
                shift @query;
                $q = join( " ", @query );
            }
            else {

                # XXX require a full-length line for all other types
                next if !defined $dummy2;

                $q = join " ", $number, $dummy2, @query;

            }

            # do not display credentials
            $database =~ s,([&?]password)=([^&]*),$1=XYZ,g;

            # got a new session - remember which user opened it
            if ( $type && $type eq 'initRequest' ) {
                warn "Init: $type, $id, database: $database\n" if $debug;

                # do not store passwords: user/password
                my $user = $database;
                $user =~ s,/.*,,;
                $user .= ":$id" if $debug >= 2;

                $hash_log->{$id}{"user"} = $user;
            }

            # got a search - count it and remember database and query
            elsif ( $type && $type eq 'searchRequest' ) {
                warn "Search: $type, $id, database: $database, query: $q\n"
                  if $debug;
                $hash_log->{$id}{"db"} = $database;
                $hash_log->{$id}{"q"}  = $q;

                my $user = $hash_log->{$id}{"user"} || "";
                my $name = $user . $sep . $database;
                $hash_db->{ALL}{$name}++;
                $hash_db->{ERR}{$name} += 0;    # init
            }

            # got a failure
            #
            # log example:
            # searchResponse Failure DIAG 3 Error in step 'Set for...'
            elsif ($type
                && $type     eq 'searchResponse'
                && $database eq 'Failure'
                && $dummy2 =~ /^\d+$/ )
            {
                if ( $debug >= 2 ) {
                    warn $hash_log->{$id}, "\n";
                }

                # remember database name and query
                # NOTE(review): when resuming with --offset, the matching
                # searchRequest may lie before the resume point, leaving
                # {db} and {q} undef here — confirm intended behavior
                my $user = $hash_log->{$id}{"user"} || "";
                my $name = $user . $sep . $hash_log->{$id}{"db"};
                $name .= "\t" . $q if $display_step_errors;
                $name .= "\t" . $hash_log->{$id}{"q"} if $display_query_errors;
                $hash_db->{ERR}{$name}++;
                $hash_db->{TIME}{$name} = $iso_date;
            }

            # got a failure
            #
            # log example:
            # presentResponse Failure DIAG 13 74
            elsif ($type
                && $type     eq 'presentResponse'
                && $database eq 'Failure'
                && $dummy2 =~ /^\d+$/ )
            {
                if ( $debug >= 2 ) {
                    warn $hash_log->{$id}, "\n";
                }

                # remember database name and query
                my $user = $hash_log->{$id}{"user"} || "";
                my $name = $user . $sep . $hash_log->{$id}{"db"};
                $name .= "\t" . "$type $database $q"
                  if $display_step_errors;
                $name .= "\t" . $hash_log->{$id}{"q"} if $display_query_errors;

                $hash_db->{ERR}{$name}++;
                $hash_db->{TIME}{$name} = $iso_date;
            }

        }

        # record offset for next run
        if ( $offset_filename && scalar(@files) == 1 ) {
            my $st = stat($logfile) or die "stat: $logfile\n";
            write_offset( $offset_filename, $st->size );
        }
    }

    return $hash_db;
}

# Print one CSV header row built from the column names in @$names.
# Dies on a combine() failure, consistent with the data-row call sites.
# (The former dead code emitting an extra empty line after an
# unconditional "return;" was unreachable and has been removed.)
sub csv_header {
    my $csv   = shift;
    my $names = shift;

    # column names
    my $status = $csv->combine(@$names);
    die "CSV error: " . $csv->error_diag . "\n" if !$status;

    print $csv->string(), "\n";
    return;
}

# Build the per-user path under /var/tmp where the log offset is kept
# between runs ($< is the real UID of the invoking user).
sub get_offset_filename {
    return sprintf "/var/tmp/ppstat-offset-%d.txt", $<;
}

######################################################################
#
# main
#

# command line processing; an unknown option aborts with the usage text
GetOptions(
    "help"                 => \$help,
    "display-step-errors"  => \$display_step_errors,
    "display-query-errors" => \$display_query_errors,
    "display-time"         => \$display_time,
    "debug=i"              => \$debug,
    "offset=i"             => \$offset,
    "offset-filename=s"    => \$offset_filename,
    "offset-store"         => \$offset_store,
) or die usage;

die &usage if $help;

# --offset-store selects a per-user offset file automatically
if ($offset_store) {
    $offset_filename = get_offset_filename;
}

# log analyze: count searches and failures per user/database
$hash_db = parse_log(
    'offset'          => $offset,
    'hash_db'         => $hash_db,
    'files'           => \@ARGV,
    'offset_filename' => $offset_filename
);

# suppress the CSV header on incremental (offset-based) runs so
# repeated output can be concatenated into one report
my $db                 = $hash_db->{ERR};     # failure counts per key
my $dbd                = $hash_db->{TIME};    # time of last failure per key
my $display_details    = $display_step_errors || $display_query_errors;
my $display_csv_header = !( $offset || $offset_store || $offset_filename );

#print "Fail." . ( !$display_details ? "\tAll\t%" : "" ) . "\tconnector\n";
my $csv = Text::CSV->new(
    {
        quote_char          => '"',
        escape_char         => '"',
        sep_char            => ',',
        eol                 => '',
        binary              => 1,
        allow_loose_quotes  => 1,
        allow_loose_escapes => 1,
        allow_whitespace    => 1,
        blank_is_undef      => 1,
    }
);

# detailed errors: one CSV row per distinct failure key
my $fail = $display_time ? 'Time' : 'Fail.';
if ($display_details) {
    my @names = ( $fail, 'connector URL', 'Diagnostic / step errors' );
    push( @names, 'Search Query' ) if $display_query_errors;

    csv_header( $csv, \@names ) if $display_csv_header;

    # chronological with --display-time, otherwise most frequent first
    my @rows;
    if ($display_time) {
        @rows = sort { $dbd->{$a} cmp $dbd->{$b} || $a cmp $b } keys %$dbd;
    }
    else {
        @rows = reverse sort { $db->{$a} <=> $db->{$b} || $a cmp $b } keys %$db;
    }
    foreach my $key (@rows) {
        if ( $db->{$key} > 0 ) {

# display in the first row the date&time of failure, or how often the error occured
            my $first_column = $display_time ? $dbd->{$key} : $db->{$key};

            # the key carries tab-separated extra columns, split them out
            my $status = $csv->combine( $first_column, split( "\t", $key ) );
            die "CSV error: " . $csv->error_diag . "\n" if !$status;

            print $csv->string(), "\n";
        }
    }
}

# simple error count and percentage of failures
else {
    $db = $hash_db->{ALL};
    my @data;

    # one record per user/database: [ failures, total, percent, key ]
    foreach my $key (
        reverse sort { $hash_db->{ERR}{$a} <=> $hash_db->{ERR}{$b} }
        keys %$db
      )
    {
        push(
            @data,
            [
                $hash_db->{ERR}{$key},
                $db->{$key},

                # rounded percentage of failed searches
                int(
                    100 * $hash_db->{ERR}{$key} / $hash_db->{ALL}{$key} + 0.5
                ),
                $key
            ]
        );
    }

    csv_header( $csv, [ $fail, 'All', '%', 'connector URL' ] );

    # worst failure percentage first, ties broken by total searches
    foreach my $l ( reverse sort { $a->[2] <=> $b->[2] || $a->[1] <=> $b->[1] }
        @data )
    {
        my $status = $csv->combine( $l->[0], $l->[1], $l->[2] . "%", $l->[3] );
        die "CSV error: " . $csv->error_diag . "\n" if !$status;

        print $csv->string(), "\n";

        #print $l->[0], "\t", $l->[1], "\t", $l->[2], "%\t", $l->[3], "\n";
    }
}

1;
