#!/usr/bin/perl

# cproxy
#
# Takes an incoming http request, adds cookies, authentication, etc
# to it, and passes it on to the target site. Assumes Apache's mod-
# proxy-html module with rewrite links etc in the page on its way out.
#
# Uses a session file from the ProxyUrl step. If one is not available,
# tries to check for a parameter file from Metaproxy, and if that is
# there, uses it to invoke a connector to create a session, and then
# proceeds to use that session.
#
# Take 2 - pass all redirects etc to the users browser
# Let it worry about cookies etc

use English;
use CGI ':standard';    # apt-get install perl-modules
use POSIX qw(strftime);
use File::stat;
use LWP::UserAgent;

use strict;
use warnings;

my $debug = 0;
$debug =  $ENV{'CPROXYDEBUG'} || $debug;

my $c = ""; # debug output
my $dumpfile = "";  # debug dumps
my $cgi = new CGI();


dbg("\n<h1>script works</h1>\n");

####### Configuration

#my $configfilename = "/etc/cf-proxy/cproxy.cfg";  
my $configfilename = $ENV{'CPROXYCONFIG'} || "/etc/cf-proxy/cproxy.cfg";
my %configs;
open(F,$configfilename)
  or configerrorpage($cgi, "Could not open config file '$configfilename': $!");
while ( <F> ) {
    chomp();
    s/\#.*$//;
    if ( /^ *([^ :]+) *: *(.*)$/ ) {
        $configs{$1}=$2;
        dbg("config '$1':'$2'");
    }
}

# This is what gets removed from the incoming url, together
# with the session number.
# It can be something like "hpxy.indexdata.com/XXX/node102"
my $fullproxyhost = $configs{'proxyhostname'};
my ( $proxyhost, $proxyprefix ) = split('/', $fullproxyhost, 2);
$c .= "config: proxyhost: '$proxyhost', prefix: '$proxyprefix' <br/>\n"
  if ($debug);

# Check the proxy prefix for session-id like numeric sequences
# These could lead to subtle bugs, so we just disallow them
if ( $proxyprefix =~ /(\/[0-9]+(\/|$))/ ) {
  configerrorpage($cgi, "proxyhostname may not contain a full-numerical ".
    "component '$1' - it will get confused with session-id" );
}


####### Main program
my $userAgent = $cgi->user_agent();
my $ua = new LWP::UserAgent;
$ua->agent($userAgent); 
$ua->requests_redirectable( [] ); # disable all redirects

my %headers = ();

$c .= dumpcgi($cgi) if ($debug>1);

my $sesdir = $configs{'sessiondir'};
configerrorpage($cgi,"No 'sessiondir' in config") unless ( $sesdir );

my ($url,$ses) = geturl($cgi);

$c .= "This is session $ses <br/>\n"
    if ($debug && $ses);
$c .= "Should get page '$url' <a href='$url'>link</a> <p/>\n"
    if ($debug && $url);

### Experiment: If no session, extract one from Referer-header, and
# redirect to a properly proxified URL (so we get the referer right
# on the links on that page
if ( !$ses ) {
  my $ref = $cgi->http("Referer");
  if ( $ref && $ref =~ /$proxyhost\/$proxyprefix\/([0-9]+)\/([^\/ ]+)/ ) {
    $ses = $1;
    my $host = $2;
    checkdumpfile($ses,$url);
    my $target = $url;
    $target =~ s/^https?:\/\///;
    my $location = "http://$fullproxyhost/$ses/$host/$target";
    dbg("No session in URL");
    dbg("Referer: $ref");
    dbg("Found session $ses and host $host in Referer");
    dbg("Redirecting to $location");
    #die ("Should redirect to session $ses u=$url l=$location");
    print $cgi->redirect($location);
    print "$c\n" if ($debug);
    exit 0;
  }
}
# End of Referer-experiment

requesterrorpage($cgi, "No cproxy session specified in the url<br/>\n")
    unless ( $ses );

my $req_method = $cgi->request_method();
dbg("Received a $req_method request");
if ( $req_method ne "GET" ){
  requesterrorpage($cgi, "Method $req_method not supported");
}

my $sesf = "$sesdir/cf.$ses";
my $sessiontimeout = 0.5; # in days!  TODO - Get from config

# Check if we have a session already. If not, try to see if we have a
# parameter file, and create session from there.
if ( (! -f $sesf) ||
     (-M $sesf > $sessiontimeout ) ) {
    my $paramfile = "$sesdir/cf.$ses.p";
    if ( -f $paramfile ) {
      startsession($cgi, $ses, $paramfile);
      internalerrorpage($cgi,"Creating session $ses failed ".
          "(issues with the content connector?)" ) unless ( -f $sesf );
      # TODO - Better error reporting within startsession
      # This should be a last fallback, and a sanity check
    }
}
$c .= "-f of $ses = " . (-f $sesf ) ."<br>\n" if ($debug);
requesterrorpage($cgi,"No session $ses")
    unless ( -f $sesf );
requesterrorpage($cgi,"Session has $ses expired" )
    unless ( -M $sesf < $sessiontimeout ); 


############# Read the session file
my $username="";
my $password="";
my $proxyip="";
my @sescookies;
my $referer="";

open (SF, $sesf);
my $targethost = $1 if ( $url =~ /https?:\/\/([^:?\/]+)/ );
while ( <SF> ) {
    chomp();
    my $line = $_;
#    $c .= "sess: $_ <br/>\n";
    if ( $line =~ /^Referer +(.*)$/ ) {
      $referer = $1;
    }
    if ( $line =~ /^Username (.+)$/ ) {
      $username = $1;
    }
    if ( $line =~ /^Password (.+)$/ ) {
      $password = $1;
    }
    if ( $line =~ /^Proxyip (.+)$/ ) {
      $proxyip = $1;
      if ( $proxyip !~ /^https?:\/\/.+/ ) {
        $proxyip = "http://$proxyip";
      }
    }
    if ( $line =~ /^Cookie +([^;]+) ?; *([^=]+)=(.*)$/ ) {
      my ( $dom, $key, $val ) = ( $1, $2, $3 );
      if ( $targethost =~ /$dom$/ ) {
        dbg("cookie from session: '$dom' '$key' '$val'") ;
        push @sescookies, "$key=$val";
      } else {
        dbg("Cookie domain '$dom' does not match '$targethost', " .
            "skipping '$key' '$val' ");
        #dbg("Passing the cookie anyway!");
        #push @sescookies, "$key=$val";
      }
    }
}



my $req = new HTTP::Request GET => $url;

# Copy headers over
my $requestcookie = "";
for my $hdr ( $cgi->http() ) {
    chomp($hdr);
    next unless( $hdr =~ /^HTTP/ ); # skip all that are not headers
    next if ( $hdr =~ /^HTTP_HOST/ ); 
    next if ( $hdr =~ /^HTTP_X_FORWARD/ ); # no need to talk about the proxying
    next if ( $hdr =~ /^HTTP_CONNECTION/ ); # don't mess with that
    $hdr =~ s/^HTTP_//;
    my $hdrval = $cgi->http($hdr);
    if ( $hdr =~ /Cookie/i ) {
      dbg("Cookie header from request: $hdr $hdrval");
      $requestcookie = $hdrval;
      next;
    }
    dbg("Extra header $hdr: $hdrval ");

    if ( $hdr =~ /Referer/i ) {  # de-proxify the referer header
      $hdrval =~ s/^http:\/\/$fullproxyhost\/([0-9]+)\//http:\/\// ;
      $referer = $hdrval;
      dbg("Deproxified the Referer header to $hdrval" );
      next;
      # override the one from session
    }
 
    $req->header( $hdr => $hdrval );
}

if ( $referer ) {  # Have one from cfsession or from request
  $req->header( "Referer" => $referer );
}


# Merge cookies from the request and the session
print DF "Request cookies: $requestcookie\n" if ($dumpfile);
$c .= "Request cookies: $requestcookie<br>\n" if ($debug);
for my $c ( reverse @sescookies ) {  # reverse, so latest wins
  my ($scname,$scval) = split('=',$c,2);
  if ( $requestcookie =~ /$scname=/ ) {
    dbg("session cookie $scname=$scval not set, already in request");
  } else {
    dbg("Appending session cookie $scname=$scval");
    $requestcookie .= "; " if ($requestcookie);
    $requestcookie .= "$scname=$scval";
  }
}
if ($requestcookie) {
  dbg("Final request cookies: $requestcookie");
  $req->header( "Cookie" => $requestcookie );
}


if ( $username || $password ) {  # (used to be &&, but that was wrong)
    dbg("Setting up authorization as $username:$password");
    $req->authorization_basic($username, $password);
}

if ( $proxyip ) {
    dbg("Setting up http proxy to $proxyip");
    $ua->proxy( ['http', 'https'], $proxyip );
}


$c .= "<p/>\nREQUEST === <br/>\n". $req->as_string("<br/>\n") if ($debug);
print DF "\n============ REQUEST ===========\n" . $req->as_string("\n")
  if ( $dumpfile );

#my $content="";
 
###########
# Pass the request to the server, and get a response
my $res = $ua->request($req);

#$c .= "<p/>\nRESPONSE === <br/>\n". $res->as_string("<br/>\n") if ($debug);

dbg("========= RESPONSE ===========");
#print DF "\n============ RESPONSE ===========\n" . $res->as_string("\n")
#  if ($dumpfile);
# Not much use, it is only in the way

####### Headers

# Check first for a redirect to https, we can't do that
# Needs to be done before sending 200-OK
if ( $res->code == 302 &&
     $res->header("Location") =~ /^https:/ ) {
  dbg("Not redirecting to " . $res->header("Location") );
  requesterrorpage($cgi,"Can not proxy HTTPS");
}

# Loop through the headers, fix what needs to be fixed,
# and output them immediately, without bothering with all the
# nice features that CGI and HTTP offer - they just get in the way.

if ( $debug && !$res->is_success ) {
  # The debug output is ok (as far as HTTP is concerned, otherwise
  # we might not see it.
  print "Status: 200 OK (but with errors)\n";
  dbg("Faking OK status, incoming was: " . $res->code. " ". $res->message );
} else {
  print "Status: " . $res->code. " ". $res->message . "\n";
  dbg("Response status " . $res->code. " ". $res->message );
}

# Procerss all headers
$res->scan( sub {
    my ($hdr, $val) = @_;
    dbg("Response header $hdr: $val");
    if ( $hdr =~ /Content-Type/i && $debug ) {
      dbg("  Faking content type to text/plain since we are debugging");
      # Easier to read on the browser.
      # If not debug, treat as any other header, pass through unchanged
      $val = "text/plain\n";
    }
    if ( $hdr =~ /Location/i ) {
      # a) absolute url http://foo.com/page/
      #     -> http://cproxy.id.com/prefix/9999/foo.com/page/
      $val =~ s/^(https?:\/\/)/$1$fullproxyhost\/$ses\//;
      # b) relative url /page/
      #     -> /prefix/9999/foo.com/page/   ###
      my $hostname = $1 if ( $url =~ ( /^https?:\/\/([^\/]+)/ ));
      $val =~ s/^\//\/$proxyprefix\/$ses\/$hostname\//;
      dbg("  Fixed Location to $val");
    }
    if ( $hdr =~ /Link/i ) {
      # Site-relative link </some/where...>
      $val =~ s/<\//<\/$proxyprefix\/$ses\//;
      # absolute link <http://site.com/some/where...>
      $val =~ s/(<https?:\/\/)/$1$fullproxyhost\/$ses\//;
      # Relative link <styles/foo.css> need not be fixed, they are still
      # relative to the current page, proxified or not.
      dbg("  Fixed link to $val");
      # TODO - Find a place that uses absolute links, and check they work!
    }
    if ( $hdr =~ /Set-Cookie/i ) {
      my $domain = $1 if ( $val =~ / domain=([^;]+)/i );
      if ( $domain ) {
        # Change domain to our proxy host address, always
        # Keep the "domain" word as it was, uppercased or not
        $val =~ s/(domain=)[^; ]+/$1$proxyhost/i;
      } else { # no domain set, append ours.
        $val .= ";Domain=$proxyhost";
      }
      my $path = $1 if ( $val =~ / path=([^; ]+)/i );
      # Path will always have the proxy prefix and session, to keep things
      # separate.
      my $newpath =  "/" . $proxyprefix. "/" . $ses . "/";
      # If the cookie domain was absolute, we can safely append it to the path
      # For relative domains (.indexdata.com), we can not, as they will not
      # match, and the browser will not send the cookie. The best we can do
      # is to leave the path to be session-global, and hope we don't meet
      # different conflicting wildcard cookies ( JSESSION in .content.site.com
      # and in .images.site.com).
      if ( $domain =~ /^[a-zA-Z]/ ) {
        $newpath .= $domain. "/";
      } else {
        # wildcard domain (.indexdata.com). Create an extra cookie, as if
        # it had been a absolute domain, just in case. Some sites set all
        # their cookies .subdomain.domain.com, even if they mean plain
        # subdomain.domain.com
        my $absdomain = $domain;
        if ( $absdomain =~ s/^\.// ) {
          my $altpath = $newpath . $absdomain . "/";
          my $altval = $val;
          $altval =~ s/(path=)[^; ]+/$1$altpath/i;
          dbg ("  Created extra cookie $altval");
        }
      }
      $val =~ s/(path=)[^; ]+/$1$newpath/i;
      dbg("  Fixed cookie to $val");
    }
    if ( $hdr =~ /Content-Length/ && $debug ) {
      dbg("  Skipping Content-Length header ($val), we have added debug content");
      return;
    }
    if ( $hdr =~ /Content-Base/i ) {  # 
      # absolute link http://site.com/some/where
      $val =~ s/(https?:\/\/)/$1$fullproxyhost\/$ses\//;
      dbg("  Fixed Content-Base header $val");
    }

    # Looks like Apache barfs on any kind of continuation lines,
    # so we join the whole header into one line. May give problems
    # if the line is over 8k long, not likely to happen.
    # (note that we may see a \r\n line delimiter, better remove the \r too.
    # \s matches any kind of whitespace)
    $val =~ s/\s*\n\s*/ /g;

    # Pass other headers through unmodified
    print "$hdr: $val\n";
  }
);

## Experiment to brute-force LexisNexis' javascript redirect
# ><body onload="javascript:window.parent.location='/uk/legal/returnTo.do?returnToKey=20_T15118578310';>
# Not needed after finding the cookie whitespace issue, bug 5961.
# Kept here as a reference anyway
my $hostname = $1 if ( $url =~ ( /^https?:\/\/([^\/]+)/ ));
my $content = $res->content;
dbg("=== Checking content ===");

#if ( $content =~ s/(javascript:window\.parent\.location='\/)/$1$proxyprefix\/$ses\/$hostname\// ) {
#  dbg("Brute-forced a Lexis javascript redirect");
#}
if ( $content =~ /^ *(.*window\.(parent\.)?location *= *['"]\/.*$)/m ) {
  dbg("Looks like some javascript redirect might need proxying: $1");
}
if ( $content =~ /(\<[^>]*javascript:.*['"]\/[^'">]*['"][^>]*\>)/ ) {
  dbg("Looks like some javascript link might need proxying: $1");
}

print "\n" ;
if ($c && $debug) {
  print "\n================ DEBUG ============= <br/>\n";
  print $c;
  print "\n================ CONTENT =========== <br/>\n";
}
#print $res->content ;
print $content ;
print DF "\n================ CONTENT =========== <br/>\n" . $content
  if ($dumpfile);
exit (0);

########
# Error handling

# Error message. (unlike earlier), the code only goes into the headers, and
# can contain both a number and a text, as in "404 Not Found".
# The message is displayed on the screen.
sub errormsg {
  my ( $cgi, $code, $msg ) = @_;
  $headers{'-status'} = $code;
  print $cgi->header(%headers), $msg;
  print "<p/>$c" if $debug;
  if ( $dumpfile) {
    print DF "\n====== RETURNING ERROR ========= \n";
    print DF $cgi->header(%headers), $msg
  }
  exit(0);  
}

# Makes a proper error page
sub errorpage {
  my ( $cgi, $code, $msg ) = @_;
  warn("$code  $msg"); # Get in apache error log too
  errormsg($cgi, $code,
    "<html><body>\n$msg\n</body></html>\n" );
}

# Makes a cproxy config error page
sub configerrorpage {
  my ( $cgi, $msg ) = @_;
  errorpage( $cgi, "500 Internal error - cproxy config",
    "<h1>500 Internal Error - cproxy config</h1>\n$msg" );
}
# Makes a "500 Internal Error"
sub internalerrorpage {
  my ( $cgi, $msg ) = @_;
  errorpage( $cgi, "500 Internal error",
    "<h1>500 Internal Error</h1>\n$msg" );
}

# Makes a "400 Bad request" page
sub requesterrorpage {
  my ( $cgi, $msg ) = @_;
  errorpage( $cgi, "400 Bad Request",
    "<h1>400 Bad Request</h1>\n$msg" );
}

# Debug routine
# Append message to $c (and from there to the screen) if $debug
# Print to dump file if $dumpfile
sub dbg {
  my $msg = shift;
  $c .= $msg . "<br/>\n" if ($debug);
  print DF $msg . "\n" if ($dumpfile);
}

#########

# Check if we have a directory /tmp/cf.XXXXX.dump, and if so,
# open a file for dumping all traffic there
sub checkdumpfile {
  my ( $ses, $url ) = @_;
  my $dumpdir = $sesdir . "/cf." . $ses . ".dump";
  $c .= "Checking dumpdir '$dumpdir' : " . ( (-d $dumpdir) || "0" ) . "<br/>\n"
    if $debug;
  if ( -d $dumpdir ) {
    my $fn = $url;
    $fn =~ s/^http://;
    $fn =~ s/\?.*//;  # drop parameters, way too long
    $fn =~ s/\//_/g;
    $fn =~ s/^_*//;
    $dumpfile = $dumpdir . "/" . $fn;
    if ( -f $dumpfile ) {
      my $suffix = 1;
      while ( -f $dumpfile . "." . $suffix ) {
        $suffix ++;
      }
      $dumpfile.= "." . $suffix;
    }
    open DF, ">$dumpfile"
      or internalerrorpage($cgi,"Could not open dump file '$dumpfile': $!");
    print DF "Cproxy dump " . `date +%c`;
    print DF "Session: $ses \nURL: $url \n";
  }
}

# Extract the target url and session id from the incoming url
sub geturl {
  my $cgi = shift;
  #my $s = $ENV{'SERVER_NAME'};
  my $s = $ENV{'HTTP_X_FORWARDED_HOST'};
  my $u;
  my $ses="";
  my $r = $ENV{'REQUEST_URI'}; # /www.some.tld/...

  # If the request is in the form
  #    hpxy.indexdata.com/prefix/17/cproxydebug/www.indexdata.com
  # the remove the cproxydebug, and set debug flag. That way, we can force debug
  # output when ever we want.
  if ( $r =~ s/\/cproxydebug\//\// ) {
    $debug = 1;
  }
  $url = "http:/" . $r; # http://www.some.tld/...
  $c .= "Looking at <br/>\n url='$url'<br/>\n s='$s'<br/>\n r='$r'<br/>\n"
     if ($debug>1);
  if ( $s =~ /^([^.]+)\.$proxyhost/ ) # old form 17.cproxy.indexdata.com/...
  { 
    $ses = $1;
  } elsif ( $r =~ /^\/$proxyprefix\/([0-9]+)\// ) { #cproxy.id.com/prefix/9999/...
    $ses = $1;
    $url =~ s/\/$proxyprefix\/$ses//;
  } else {
    $c .= "ERROR: Could not get session from '$s' and '$r', \n  expected '$fullproxyhost'<br/>\n"
      if $debug;
  }
  checkdumpfile($ses,$url);
  return ($url,$ses);

}


##########
# Try to create a session by connecting to the cf-engine
# Die on errors, return only on success
sub startsession {
    my $cgi = shift;
    my $ses = shift;
    my $paramfile = shift;
    $c .= "startsession: ses = $ses, p='$paramfile' <br/>\n" if($debug);
    if ( ! -f $paramfile ) {
        requesterrorpage($cgi,"Bad session $ses, no parameters found");
    }
    open(F,$paramfile)
        or internalerrorpage($cgi,"Could not read session parameters $ses");
    my $username = "";
    my $password = "";
    my $connector = "";
    my $proxy = "";
    my $realm = "";
    while ( <F> ) {
        chomp();
        s/\#.*$//; # skip comments
        if ( /^ *auth *: *([^\/]+)\/?(.*)$/ ) {
            $username = $1;
            $password = $2;
        }
        if ( /^ *connector *: *(.*)$/ ) {
            $connector = $1;
        }
        if ( /^ *proxy *: *(.*)$/ ) {
            $proxy= $1;
        }
        if ( /^ *realm *: *(.*)$/ ) {
            $realm = $1;
        }
    }
    if ( ! $connector ) {
      internalerrorpage($cgi, "Bad session param file, no connector specified");
      #die("Bad session param file $paramfile, no connector specifiedi\n");
    }
    my $cfengine = $configs{'cfengine'};
    if ( ! $cfengine ) {
        configerrorpage($cgi,"No 'cfengine' specified in config file, " .
             "can not create session\n");
    }
    # TODO - The params should be url-encoded, just to be sure we can handle
    # a password with a '&' in it.
    my $sru = "http://$cfengine/$connector";
    $sru .= ",cproxysession=$ses";
    $sru .= "&realm=$realm" if ($realm);
    $sru .= "&user=$username" if ($username);
    $sru .= "&password=$password" if ($password);
    $sru .= "&proxy=$proxy" if ($proxy);
    $sru .= "?version=1.2";
    $sru .= "&operation=searchRetrieve";
    $sru .= "&x-pquery=dummy";
    $sru .= "&maximumRecords=0";

    $c .= "Getting $sru <br/>\n" if ($debug);

    my $userAgent = $cgi->user_agent();
    my $ua = new LWP::UserAgent;
    $ua->agent($userAgent);
    my $srureq = new HTTP::Request GET => $sru;
    print DF "\n============ SRU REQUEST ===========\n" . $srureq->as_string("\n")
      if ( $dumpfile );
    my $srures = $ua->request($srureq);
    print DF "\n============ SRU RESPONSE ===========\n" . $srures->as_string("\n")
      if ( $dumpfile );
    
    if ($srures->is_success) {
        $c .= "Got response ok <br/>\n" if($debug);
        ## TODO - Check for error return from cf
        if ( $srures->content =~ /<diag:diagnostic>(.*)<\/diag:diagnostic>/ ) {
          my $diag = $1;
          $diag =~ s/&/&amp;/g;  # poor mans encoding
          $diag =~ s/</&lt;/g;  
          internalerrorpage($cgi, "Error with content connector <br/>\n" .
             $diag );
        }
    } else {
        $c .= "Session-get FAILED <br/>\n" if($debug);
        $c .= "$srures->message <br/>\n" if($debug);
        warn("Error creating session! $ses: " . $srures->code);
        warn($srures->content );
        errormsg($cgi, $srures->code,
            "Error creating session! <br/>\n " . $srures->content );
    }
} # startsession

#########

sub dumpcgi {
    my ($cgi) = @_;
    
    my $c .= "Env: <br/>";
    foreach ( keys(%ENV) ) {
        $c .= "'$_' : '" . $ENV{$_} . "'<br/>\n";
    }
    $c .= "<p/>\n";
    
    $c .= "Cookies: <br/>";
    foreach ( $cgi->cookie() ) {
        $c .= "'$_' : '" . $cgi->cookie($_) . "'<br/>\n";
    }
    $c .= "<p/>\n";
    return $c;
}

########
sub dumpredirect {
  my ($resp) = @_;
  return 0 unless ($resp );
  my $i = dumpredirect( $resp->previous );
  $i++;
  print DF "=========== REDIRECT REQUEST $i ========= \n";
  print DF $resp->request->as_string . "\n";
  print DF "=========== REDIRECT RESPONSE $i ========= \n";
  print DF $resp->as_string . "\n";
  return $i;
}

# Editor trickery
# vim: shiftwidth=4 tabstop=8 expandtab 
