#!/usr/bin/perl

# cproxy
#
# Takes an incoming http request, adds cookies, authentication, etc
# to it, and passes it on to the target site. Assumes Apache's mod-
# proxy-html module with rewrite links etc in the page on its way out.
#
# Uses a session file from the ProxyUrl step. If one is not available,
# tries to check for a parameter file from Metaproxy, and if that is
# there, uses it to invoke a connector to create a session, and then
# proceeds to use that session.
#
# The philosophy is to pass things through with as little change as
# possible. Most of the html will be proxified by apache's mod_proxy_html,
# but we need to a little trickery here and there.
#
use English;
use CGI ':standard';    # apt-get install perl-modules
use POSIX qw(strftime);
use File::stat;
use LWP::UserAgent;
use Encode; # for utf-8 trickery, fixing bad encoding

use strict;
use warnings;

##################
#Global variables
##################

# yes, some things are easiest to keep globally available. Many should be
# factored away

# Debug flag, can be set in various ways
my $debug = 0;
$debug =  $ENV{'CPROXYDEBUG'} || $debug;

my $c = ""; # debug output
my $dumpfile = "";  # debug dumps

my $cgi; # Handle to apache interface

# The following could be moved out of the global scope, but not too easily TODO
my $fullproxyhost;  # hpxy.indexdata.com/XXX/node102
my $proxyhost;      # hpxy.indexdata.com
my $proxyprefix;    # XXX/node102

my $url;  # The url we want
my $ses;  # session number


my $username=""; # Stuff from the session file TODO Move into some kind of structure
my $password="";
my $proxyip="";
my $referer="";
my @sescookies;



######################
# Main program
######################
main() ;

sub main {
  # as a sub, so it can have local variables without polluting the global name 
  # space.
  my $ua = init();  # $ua us the user-agent for making HTTP requests with
  my %configs = readconfigfile();
  my $sesdir = $configs{'sessiondir'};
  $c .= dumpcgi($cgi) if ($debug>1);

  ($url,$ses) = parseurl($cgi,$sesdir);
  checkdumpfile($ses, $url, $sesdir );

  getsession($sesdir, $ses, $configs{'cfengine'});

  my $req = makerequest($cgi, $referer, $ua);

  my $res = httpget($req, $ua);

  checkstatus($res);
  my $charsetsave = responseheaders($res);
  content($res, $charsetsave );
  exit (0);

} # sub main


###################
#  Housekeeping
###################

####### Some initialization
sub init {
  # Make sure STDOUT is not encoding anything.
  binmode(STDOUT, ":raw");
  $cgi = new CGI(); # Input from apache, etc

  my $userAgent = $cgi->user_agent(); 
  my $ua = new LWP::UserAgent; # for making HTTP requests
  $ua->agent($userAgent);
  $ua->requests_redirectable( [] ); # disable all redirects
  $ua->parse_head(0); # Do not look in html/head section for meta-equiv stuff

  # Output the first debug, to see we are running at all.
  dbg("\n<h1>script works</h1>\n");
  return $ua;
} # init

####### Configuration
sub readconfigfile {
  my $configfilename = $ENV{'CPROXYCONFIG'} || "/etc/cf-proxy/cproxy.cfg";
  my %configs; 
  open(F,$configfilename)
    or configerrorpage($cgi, "Could not open config file '$configfilename': $!");
  while ( <F> ) {
      chomp();
      s/\#.*$//;
      if ( /^ *([^ :]+) *: *(.*)$/ ) {
          $configs{$1}=$2;
          dbg("config '$1':'$2'");
      }
  }

  # This is what gets removed from the incoming url, together
  # with the session number.
  # It can be something like "hpxy.indexdata.com/XXX/node102"
  $fullproxyhost = $configs{'proxyhostname'};
  ( $proxyhost, $proxyprefix ) = split('/', $fullproxyhost, 2);
  $c .= "config: proxyhost: '$proxyhost', prefix: '$proxyprefix' <br/>\n"
    if ($debug);

  # Check the proxy prefix for session-id like numeric sequences
  # These could lead to subtle bugs, so we just disallow them
  if ( $proxyprefix =~ /(\/[0-9]+(\/|$))/ ) {
    configerrorpage($cgi, "proxyhostname may not contain a full-numerical ".
      "component '$1' - it will get confused with session-id" );
  }

  configerrorpage($cgi,"No 'sessiondir' in config")
     unless ( $configs{'sessiondir'} );

  return %configs;
} # readconfigfile

# Extract the target url and session id from the incoming url
sub parseurl {
  my $cgi = shift;
  my $sesdir = shift;
  #my $s = $ENV{'SERVER_NAME'};
  my $s = $ENV{'HTTP_X_FORWARDED_HOST'};
  my $u;
  my $ses="";
  my $r = $ENV{'REQUEST_URI'}; # /www.some.tld/...

  my $req_method = $cgi->request_method() || "GET";
  dbg("Received a $req_method request");
  if ( $req_method ne "GET" ){
    requesterrorpage($cgi, "Method $req_method not supported");
  }

  # If the request is in the form
  #    hpxy.indexdata.com/prefix/17/cproxydebug/www.indexdata.com
  # the remove the cproxydebug, and set debug flag. That way, we can force debug
  # output when ever we want.
  if ( $r =~ s/\/cproxydebug\//\// ) {
    $debug = 1;
  }
  my $url = "http:/" . $r; # http://www.some.tld/...
  $c .= "Looking at <br/>\n url='$url'<br/>\n s='$s'<br/>\n r='$r'<br/>\n"
     if ($debug>1);
  if ( $s =~ /^([^.]+)\.$proxyhost/ ) # old form 17.cproxy.indexdata.com/...
  {
    $ses = $1;
  } elsif ( $r =~ /^\/$proxyprefix\/([0-9]+)\// ) { #cproxy.id.com/prefix/9999/...
    $ses = $1;
    $url =~ s/\/$proxyprefix\/$ses//;
  } else {
    $c .= "ERROR: Could not get session from '$s' and '$r', \n  expected '$fullproxyhost'<br/>\n"
      if $debug;
  }
  $c .= "This is session $ses <br/>\n"
      if ($debug && $ses);
  $c .= "Should get page '$url' <a href='$url'>link</a> <p/>\n"
      if ($debug && $url);

  # If no session, try to extract one from Referer-header
  # This actually redirects to a new URL with proper session in place,
  # and never returns, if it finds the session. Otherwise it is a no-op,
  # and we fail a bit later.
  if ( !$ses ) {
    getreferersession($url,$sesdir);
  }
  # TODO - Keep a cookie with the cfproxy session as well, and check from it too
  # Another last resort, probably won't help much more.
  
  requesterrorpage($cgi, "No cproxy session specified in the url<br/>\n")
      unless ( $ses );

  return ($url,$ses);
} # parseurl



#################
# Session stuff
#################


sub getsession {
  my $sesdir = shift;
  my $ses = shift;
  my $cfengine = shift; # for startsession

  my $sesf = "$sesdir/cf.$ses";
  my $sessiontimeout = 0.5; # in days!  TODO - Get from config

  # Check if we have a session already. If not, try to see if we have a
  # parameter file, and create session from there.
  if ( (! -f $sesf) ||
      (-M $sesf > $sessiontimeout ) ) {
      my $paramfile = "$sesdir/cf.$ses.p";
      if ( -f $paramfile ) {
        startsession($cgi, $ses, $paramfile, $cfengine);
        internalerrorpage($cgi,"Creating session $ses failed ".
            "(issues with the content connector?)" ) unless ( -f $sesf );
        # TODO - Better error reporting within startsession
        # This should be a last fallback, and a sanity check
      }
  }
  $c .= "-f of $ses = " . (-f $sesf ) ."<br>\n" if ($debug);
  requesterrorpage($cgi,"No session $ses")
      unless ( -f $sesf );
  requesterrorpage($cgi,"Session has $ses expired" )
      unless ( -M $sesf < $sessiontimeout );

  readsessionfile( $sesf );
} # getsession

sub readsessionfile {
  my $sesf = shift;
  open (SF, $sesf);
  my $targethost = $1 if ( $url =~ /https?:\/\/([^:?\/]+)/ );
  while ( <SF> ) {
      chomp();
      my $line = $_;
  #    $c .= "sess: $_ <br/>\n";
      if ( $line =~ /^Referer +(.*)$/ ) {
        $referer = $1;
      }
      if ( $line =~ /^Username (.+)$/ ) {
        $username = $1;
      }
      if ( $line =~ /^Password (.+)$/ ) {
        $password = $1;
      }
      if ( $line =~ /^Proxyip (.+)$/ ) {
        $proxyip = $1;
        if ( $proxyip !~ /^https?:\/\/.+/ ) {
          $proxyip = "http://$proxyip";
        }
      }
      if ( $line =~ /^Cookie +([^;]+) ?; *([^=]+)=(.*)$/ ) {
        my ( $dom, $key, $val ) = ( $1, $2, $3 );
        if ( $targethost =~ /$dom$/ ) {
          dbg("cookie from session: '$dom' '$key' '$val'") ;
          push @sescookies, "$key=$val";
        } else {
          dbg("Cookie domain '$dom' does not match '$targethost', " .
              "skipping '$key' '$val' ");
          #dbg("Passing the cookie anyway!");
          #push @sescookies, "$key=$val";
        }
      }
  }
} # readsessionfile

# A last-ditch attempt to get the session from the referer header. If found,
# force a redirect to a properly proxified URL (so we get the referer headers
# right for the links on that page)
# Some javascripts build a URL to /some/page, which the browser expands
# to cproxy.id.com/some/page. We need to get the proxyhost, proxyprefix, and
# the original hostname from the referer, build a proxified URL, and redirect
# there.
sub getreferersession {
    my $url = shift;
    my $sesdir = shift;
    my $ref = $cgi->http("Referer");
    dbg("Got referer '$ref' ph='$proxyhost' pp='$proxyprefix' url='$url'");
    if ( $ref && $ref =~ /$proxyhost\/$proxyprefix\/([0-9]+)\/([^\/ ]+)/ ) {
      $ses = $1;
      my $host = $2;
      checkdumpfile($ses, $url, $sesdir );
      my $target = $url;
      $target =~ s/^https?:\/\///;
      my $location = "http://$fullproxyhost/$ses/$host/$target";
      dbg("No session in URL");
      dbg("Referer: $ref");
      dbg("Found session $ses and host $host in Referer");
      dbg("Redirecting to $location");
      #die ("Should redirect to session $ses u=$url l=$location");
      print $cgi->redirect($location);
      print "$c\n" if ($debug);
      exit 0;
    }
} # getreferersession

##########
# Try to create a session by connecting to the cf-engine
# Die on errors, return only on success
# This is used on SRU connectors when Metaproxy sets up a .p file
# with the content connector info.
sub startsession {
    my $cgi = shift;
    my $ses = shift;
    my $paramfile = shift;
    my $cfengine = shift;
    $c .= "startsession: ses = $ses, p='$paramfile' <br/>\n" if($debug);
    if ( ! -f $paramfile ) {
        requesterrorpage($cgi,"Bad session $ses, no parameters found");
    }
    open(F,$paramfile)
        or internalerrorpage($cgi,"Could not read session parameters $ses");
    my $username = "";
    my $password = "";
    my $connector = "";
    my $proxy = "";
    my $realm = "";
    while ( <F> ) {
        chomp();
        s/\#.*$//; # skip comments
        if ( /^ *auth *: *([^\/]+)\/?(.*)$/ ) {
            $username = $1;
            $password = $2;
        }
        if ( /^ *connector *: *(.*)$/ ) {
            $connector = $1;
        }
        if ( /^ *proxy *: *(.*)$/ ) {
            $proxy= $1;
        }
        if ( /^ *realm *: *(.*)$/ ) {
            $realm = $1;
        }
    }
    if ( ! $connector ) {
      internalerrorpage($cgi, "Bad session param file, no connector specified");
      #die("Bad session param file $paramfile, no connector specifiedi\n");
    }
    #my $cfengine = $configs{'cfengine'};
    if ( ! $cfengine ) {
        configerrorpage($cgi,"No 'cfengine' specified in config file, " .
             "can not create session\n");
    }
    # TODO - The params should be url-encoded, just to be sure we can handle
    # a password with a '&' in it.
    my $sru = "http://$cfengine/$connector";
    $sru .= ",cproxysession=$ses";
    $sru .= "&realm=$realm" if ($realm);
    $sru .= "&user=$username" if ($username);
    $sru .= "&password=$password" if ($password);
    $sru .= "&proxy=$proxy" if ($proxy);
    $sru .= "?version=1.2";
    $sru .= "&operation=searchRetrieve";
    $sru .= "&x-pquery=dummy";
    $sru .= "&maximumRecords=0";

    $c .= "Getting $sru <br/>\n" if ($debug);

    my $userAgent = $cgi->user_agent();
    my $ua = new LWP::UserAgent;
    $ua->agent($userAgent);
    my $srureq = new HTTP::Request GET => $sru;
    print DF "\n============ SRU REQUEST ===========\n" . $srureq->as_string("\n")
      if ( $dumpfile );
    my $srures = $ua->request($srureq);
    print DF "\n============ SRU RESPONSE ===========\n" . $srures->as_string("\n"). "\n"
      if ( $dumpfile );

    if ($srures->is_success) {
        $c .= "Got response ok <br/>\n" if($debug);
        ## TODO - Check for error return from cf
        if ( $srures->content =~ /<diag:diagnostic>(.*)<\/diag:diagnostic>/ ) {
          my $diag = $1;
          $diag =~ s/&/&amp;/g;  # poor mans encoding
          $diag =~ s/</&lt;/g;
          internalerrorpage($cgi, "Error with content connector <br/>\n" .
             $diag );
        }
    } else {
        $c .= "Session-get FAILED <br/>\n" if($debug);
        $c .= "$srures->message <br/>\n" if($debug);
        warn("Error creating session! $ses: " . $srures->code);
        warn($srures->content );
        errormsg($cgi, $srures->code,
            "Error creating session! <br/>\n " . $srures->content );
    }
} # startsession

################
# Create a HTTP request for the website

sub makerequest {
  my $cgi = shift;
  my $referer = shift; # the referer header from the session, if any
  my $ua = shift;

  my $req = new HTTP::Request GET => $url;

  # Copy headers over
  my $requestcookie = "";
  for my $hdr ( $cgi->http() ) {
      chomp($hdr);
      next unless( $hdr =~ /^HTTP/ ); # skip all that are not headers
      next if ( $hdr =~ /^HTTP_HOST/ );
      next if ( $hdr =~ /^HTTP_X_FORWARD/ ); # no need to talk about the proxying
      next if ( $hdr =~ /^HTTP_CONNECTION/ ); # don't mess with that
      $hdr =~ s/^HTTP_//;
      my $hdrval = $cgi->http($hdr);
      if ( $hdr =~ /Cookie/i ) {
        dbg("Cookie header from request: $hdr $hdrval");
        $requestcookie = $hdrval;
        next;
      }
      dbg("Extra header $hdr: $hdrval ");

      if ( $hdr =~ /Referer/i ) {  # de-proxify the referer header
        $hdrval =~ s/^http:\/\/$fullproxyhost\/([0-9]+)\//http:\/\// ;
        $referer = $hdrval;
        dbg("Deproxified the Referer header to $hdrval" );
        next;
        # override the one from session
      }

      $req->header( $hdr => $hdrval );
  }

  if ( $referer ) {  # Have one from cfsession or from request
    $req->header( "Referer" => $referer );
  }


  # Merge cookies from the request and the session
  print DF "Request cookies: $requestcookie\n" if ($dumpfile);
  $c .= "Request cookies: $requestcookie<br>\n" if ($debug);
  for my $c ( reverse @sescookies ) {  # reverse, so latest wins
    my ($scname,$scval) = split('=',$c,2);
    if ( $requestcookie =~ /$scname=/ ) {
      dbg("session cookie $scname=$scval not set, already in request");
    } else {
      dbg("Appending session cookie $scname=$scval");
      $requestcookie .= "; " if ($requestcookie);
      $requestcookie .= "$scname=$scval";
    }
  }
  if ($requestcookie) {
    dbg("Final request cookies: $requestcookie");
    $req->header( "Cookie" => $requestcookie );
  }


  if ( $username || $password ) {  # (used to be &&, but that was wrong)
      dbg("Setting up authorization as $username:$password");
      $req->authorization_basic($username, $password);
  }

  if ( $proxyip ) {
      dbg("Setting up http proxy to $proxyip");
      $ua->proxy( ['http', 'https'], $proxyip );
  }
  return $req;
} # makerequest

#####################
# Get a response from the website

sub httpget {
  my $req = shift;
  my $ua = shift;

  $c .= "<p/>\nREQUEST === <br/>\n". $req->as_string("<br/>\n") if ($debug);
  print DF "\n============ REQUEST ===========\n" . $req->as_string("\n")
    if ( $dumpfile );

  # Pass the request to the server, and get a response
  my $res = $ua->request($req);

  return $res;
} # httpget


##################
# Check the status of the HTTP resonse
# Prints a status code for Apache and the browser

sub checkstatus {
  my $res = shift;

  dbg("========= RESPONSE ===========");

  # Check first for a redirect to https, we can't do that
  # Needs to be done before sending 200-OK
  if ( $res->code == 302 &&
      $res->header("Location") =~ /^https:/ ) {
    dbg("Not redirecting to " . $res->header("Location") );
    requesterrorpage($cgi,"Can not proxy HTTPS");
  }

  # Loop through the headers, fix what needs to be fixed,
  # and output them immediately, without bothering with all the
  # nice features that CGI and HTTP offer - they just get in the way.

  if ( $debug && !$res->is_success ) {
    # The debug output is ok (as far as HTTP is concerned, otherwise
    # we might not see it.
    print "Status: 200 OK (but with errors)\n";
    dbg("Faking OK status, incoming was: " . $res->code. " ". $res->message );
  } else {
    print "Status: " . $res->code. " ". $res->message . "\n";
    dbg("Response status " . $res->code. " ". $res->message );
  }
} # checkstatus

##################
# Process the headers of the HTTP response.
# Output relevant headers for apache and the browser
# TODO This needs to be refactored
sub responseheaders {
  my $res = shift;
  # Process all headers
  # Content type logic:
  # Some sites output two content-type headers, one with a charset and one without.
  # The browsers are clever enough to handle this, but mod_proxy_html believed in
  # the first one it sees, and defaults to iso-8859-1 if nothing specified. This
  # goes wrong if the site is actually sending utf-8, we doubly encoded chars.
  # See bug CP-3376: Cproxy charset issues
  my $contenttypedone = 0;
  my $contenttypesave = "";
  # ...done: 0=not seen yet. 1=seen and printed. 2=seen and saved.
  # If we meet one with a character set, we print it out in its right place
  # If we meet one without, we remember it for later. If, at the end, we have
  # not printed any good header, we print the saved one.
  my $charsetsave = ""; # Remember content-type for charset conversion tricks.

  $res->scan( sub {
      # This is a callback, called for each header. So we can return in the middle
      my ($hdr, $val) = @_;
      dbg("Response header $hdr: $val");
      if ( $hdr =~ /Content-Type/i ) {
        $charsetsave .= $val; # remember it (them all) for later
        if ( $debug && 0 ) {
          dbg("  Faking content type to text/plain since we are debugging");
          # Easier to read on the browser.
          # If not debug, treat as any other header, pass through unchanged
          $val = "text/plain\n";
          # Note: Do this for each content-type header, so we see what we receive!
        } else { # not debugging
          if ( $contenttypedone == 1 ) {
            dbg("  Skipping content type, already done ");
            return;
          }
          if ( $val =~ /charset/i ) { # a good content-type
            $contenttypedone = 1;
            dbg("  Using this content type, it has a charset");
          } else { # a bad content type, save for the last resort
            $contenttypedone = 2;
            $contenttypesave = $val;
            dbg("  Remembering this content type for later, in case nothing better comes up");
            return;
          }
        }
      } # content-type
      if ( $hdr =~ /Location/i ) {
        # a) absolute url http://foo.com/page/
        #     -> http://cproxy.id.com/prefix/9999/foo.com/page/
        $val =~ s/^(https?:\/\/)/$1$fullproxyhost\/$ses\//;
        # b) relative url /page/
        #     -> /prefix/9999/foo.com/page/   ###
        my $hostname = $1 if ( $url =~ ( /^https?:\/\/([^\/]+)/ ));
        $val =~ s/^\//\/$proxyprefix\/$ses\/$hostname\//;
        dbg("  Fixed Location to $val");
      }
      if ( $hdr =~ /Link/i ) {
        # Site-relative link </some/where...>
        $val =~ s/<\//<\/$proxyprefix\/$ses\//;
        # absolute link <http://site.com/some/where...>
        $val =~ s/(<https?:\/\/)/$1$fullproxyhost\/$ses\//;
        # Relative link <styles/foo.css> need not be fixed, they are still
        # relative to the current page, proxified or not.
        dbg("  Fixed link to $val");
        # TODO - Find a place that uses absolute links, and check they work!
      }
      if ( $hdr =~ /Set-Cookie/i ) {
        #  Get the cookie paths right. See CP-3416, and a long comment at the
        # end of the file.
        my $origdomain = "";
        $origdomain = $1 if ( $val =~ /[ ;]domain=([^;]+)/i );
        my $origpath = "";
        $origpath = $1 if ( $val =~ /[ ;]path=([^; ]+)/i );
        $val =~ s/[; ]*$//; # clean line end for appending
        if ( !$origdomain ) {
          $origdomain = ""; 
          if ($url =~ /https?:\/\/([^:?\/]+)[:?\/]/ ) {
            $origdomain = $1; # current domain, 'www.indexdata.com'
          }
          $val .= "; domain=$origdomain"; # will get replaced later
        }
        if ( !$origpath ) {
          $origpath = "";
          if ( $url =~ /https?:\/\/[^\/\?]+(\/[^?]+)/ ) {
            $origpath = $1; # '/foo/bar'
          }
          $val .= "; path=$origpath"; # will be replaced later
        }
        my $newdomain = $proxyhost;
        my $proxypart =  "/" . $proxyprefix. "/" . $ses . "/";
        my $domainpart = $origdomain ;
        my $altdomainpart = ""; # for an additional cookie header
        if ( $domainpart =~ /^\.(.*)$/ ) {   # wildcard '.indexdata.com'
          $domainpart = $1; # skip the dot, get just 'indexdata.com'
          if ($url =~ /https?:\/\/([^:?\/]+)\.$domainpart/ ) { # match the 'www'
            #$altdomainpart = "$1.$domainpart";  # 'www.indexdata.com'
            $domainpart = "$1.$domainpart";  # 'www.indexdata.com'
            $altdomainpart = "YES"; # just a signal
          }
        }
        my $pathpart = $origpath ;
        #dbg("  New cookie check: '$val'" );
        #dbg("    origdomain='$origdomain' origpath='$origpath' ");
        #dbg("    newdomain='$newdomain' proxypart='$proxypart' ");
        #dbg("    domainpart='$domainpart' alt='$altdomainpart' pathpart='$pathpart'");
        $val =~ s/(domain=)[^ ;]+/$1$newdomain/i;
        $val =~ s/(path=)[^ ;]+/$1$proxypart$domainpart$pathpart/i;
        print "$hdr: $val\r\n";
        dbg("  Fixed cookie to:  $hdr: $val");
        if ($altdomainpart) {
          $val =~ s/(path=)[^ ;]+/$1$proxypart/i;
          print "$hdr: $val\r\n";
          dbg("  Add extra cookie: $hdr: $val");
        }
        return; # from the callback, loop on to the next header
      } # cookie
      if ( $hdr =~ /Content-Length/ && $debug ) {
        dbg("  Skipping Content-Length header ($val), we have added debug content");
        return;
      }
      if ( $hdr =~ /Content-Base/i ) {  #
        # absolute link http://site.com/some/where
        $val =~ s/(https?:\/\/)/$1$fullproxyhost\/$ses\//;
        dbg("  Fixed Content-Base header $val");
      }
      # Looks like Apache barfs on any kind of continuation lines,
      # so we join the whole header into one line. May give problems
      # if the line is over 8k long, not likely to happen.
      # (note that we may see a \r\n line delimiter, better remove the \r too.
      # \s matches any kind of whitespace)
      $val =~ s/\s*\n\s*/ /g;

      # Pass other headers through unmodified
      print "$hdr: $val\r\n";
    } # header callback sub
  );

  # Last ditch attempt to get a content type with a character set
  # (note: if we don't have a content-type at all, do not assume html, it could
  # be plain text, js, image, or other stuff not to be proxified!). In practice,
  # anything that has a meta tag is fair game.
  if ( $contenttypedone == 2 ) {
    if ( $res->content =~
        /(<meta [^>]*http-equiv="Content-Type"[^>]+content=["']([^>"']+)['"][^>]*>)/ ) {
      dbg("No charset in content type, but found a meta tag");
      dbg("  $1");
      $contenttypesave = $2;
    } else {
      dbg("Using a saved (bad) content-type, since nothing better came up");
    }
    print "Content-Type: $contenttypesave\r\n";
    dbg("Response header Content-Type: $contenttypesave");
  }
  return $charsetsave;

} # responseheaders


# Check the content
sub content {
  my $res = shift;
  my $charsetsave = shift;

  my $hostname = $1 if ( $url =~ ( /^https?:\/\/([^\/]+)/ ));
  my $content = $res->content;
  dbg("=== Checking content ===");

  # Character set trickery
  my $charset = ""; # Get the (first) charset mentioned in content-type header(s)
  $charset = $1 if ( $charsetsave =~ /charset=(\S+)/ );
  #dbg("Charset '$charset' is_utf='" . utf8::is_utf8($content) . "' " .
  #    "is_valid='" . utf8::valid($content). "'" );

  # Fix broken utf-8 encodings
  if (  ( $charsetsave =~ /html/i ) && ( $charsetsave =~ /utf-?8/i )) {
    # Looks like utf-8 encoded html. Decode and re-encode it as utf-8
    # This should be a no-op, but if there are invalid byte sequences, this
    # will hide them from further processing in mod_proxy_html. Bug CP-3420
    my $origlen = length($content);
    my $check = Encode::FB_DEFAULT;  # Transform bad bytes into 0xFFFD
    # Encode::FB_CROAK;  # Die on the spot
    # Encode::FB_WARN;   # Stop reading at the bad char, write a warning
    # Encode::FB_PERLQQ; # Translate to something like \C5
    #print DF "== Content before charset trickery \n$content\n" if ($dumpfile);
    my $contentstring = decode ("utf8", $content, $check );
    #print DF "== Content after decode \n$contentstring\n" if ($dumpfile);
    $content = encode ( "utf8", $contentstring, $check);
    #print DF "== Content after charset trickery \n$content\n" if ($dumpfile);
    dbg("Attempted to fix utf-8 encoding. length: $origlen - " .
        length($contentstring) . " - " . length($content) );
  }

  if ( $content =~ /^ *(.*window\.(parent\.)?location *= *['"]\/.*$)/m ) {
    dbg("Looks like some javascript redirect might need proxying: $1");
  }
  if ( $content =~ /(\<[^>]*javascript:.*['"]\/[^'">]*['"][^>]*\>)/ ) {
    dbg("Looks like some javascript link might need proxying: $1");
  }

  # Check for assignment to document.domain (not just comparing with ==)
  if ( $content =~ /(document.domain\s*=[^=][^;\n>]+;?)/ ) {
    dbg("Looks like a script messes with document.domain. This may well go wrong: $1");
  }

  # mod_proxy_html adds a html tag in the beginning, of not there already.
  # Then the poor thing gets very confused if it sees another html tag.
  # So we must clean up any later html tags. 
  if ( ( $charsetsave =~ /html/i ) &&  # looks like a html document
       ( $content =~ /<[a-z].*<html/si ) ) {  # html that is not the first tag
    dbg("Looks like there are nested HTML tags. This may well go wrong");
    if ( $content =~ m/^(\s*(<![^>]+>\s*)?(<html)?)(.*?)(<\/html\s*>[^<]*)?$/si ) {
      my $start = $1; # possibly a <!DOCTYPE, then the opening html tag,
                    # but no other kind of tags. May well be empty.
      my $mid = $4; # Anything after that, up to but not including
      my $end = $5; # Optional closing html tag, not followed by any other tag
      dbg("Removing internal HTML tags");
      # Remove all html and /html tag(s) from the mid section
      $mid =~ s"<(/?html[^>]*)>"<!--\1-->"g;
      # and put the document togehter again
      $content = $start . $mid . $end;
    } else {
      dbg("Could not make sense of the structure, leaving tags as they are");
    }
  }
  print "\r\n" ;
  if ($c && $debug) {
    print "\n================ DEBUG ============= <br/>\n";
    print $c;
    print "\n================ CONTENT =========== <br/>\n";
  }
  print $content ;
  print DF "\n================ CONTENT =========== <br/>\n" . $content
    if ($dumpfile);
} # content



#################
# Error handling

# Error message. The code only goes into the headers, and
# can contain both a number and a text, as in "404 Not Found".
# The message is displayed on the screen.
sub errormsg {
  my ( $cgi, $code, $msg ) = @_;
  print $cgi->header(-status=>$code), $msg;
  print "<p/>$c" if $debug;
  if ( $dumpfile) {
    print DF "\n====== RETURNING ERROR ========= \n";
    print DF $cgi->header(-status=>$code), $msg
  }
  exit(0);  
}

# Makes a proper error page
sub errorpage {
  my ( $cgi, $code, $msg ) = @_;
  warn("$code  $msg"); # Get in apache error log too
  errormsg($cgi, $code,
    "<html><body>\n$msg\n</body></html>\n" );
}

# Makes a cproxy config error page
sub configerrorpage {
  my ( $cgi, $msg ) = @_;
  errorpage( $cgi, "500 Internal error - cproxy config",
    "<h1>500 Internal Error - cproxy config</h1>\n$msg" );
}
# Makes a "500 Internal Error"
sub internalerrorpage {
  my ( $cgi, $msg ) = @_;
  errorpage( $cgi, "500 Internal error",
    "<h1>500 Internal Error</h1>\n$msg" );
}

# Makes a "400 Bad request" page
sub requesterrorpage {
  my ( $cgi, $msg ) = @_;
  errorpage( $cgi, "400 Bad Request",
    "<h1>400 Bad Request</h1>\n$msg" );
}

# Debug routine
# Append message to $c (and from there to the screen) if $debug
# Print to dump file if $dumpfile
sub dbg {
  my $msg = shift;
  $c .= $msg . "<br/>\n" if ($debug);
  print DF $msg . "\n" if ($dumpfile);
}

#########

# Check if we have a directory /tmp/cf.XXXXX.dump, and if so,
# open a file for dumping all traffic there
sub checkdumpfile {
  my ( $ses, $url, $sesdir ) = @_;
  my $dumpdir = $sesdir . "/cf." . $ses . ".dump";
  $c .= "Checking dumpdir '$dumpdir' : " . ( (-d $dumpdir) || "0" ) . "<br/>\n"
    if $debug;
  if ( -d $dumpdir ) {
    my $fn = $url || "BAD_URL";
    $fn =~ s/^http://;
    $fn =~ s/\?.*//;  # drop parameters, way too long
    $fn =~ s/\//_/g;
    $fn =~ s/^_*//;
    $dumpfile = $dumpdir . "/" . $fn;
    if ( -f $dumpfile ) {
      my $suffix = 1;
      while ( -f $dumpfile . "." . $suffix ) {
        $suffix ++;
      }
      $dumpfile.= "." . $suffix;
    }
    open DF, ">$dumpfile"
      or internalerrorpage($cgi,"Could not open dump file '$dumpfile': $!");
    binmode(DF, ":raw");
    
    print DF "Cproxy dump " . `date +%c`;
    print DF "Session: $ses \nURL: $url \n";
  }
}



#########

sub dumpcgi {
    return unless $debug >1;
    my ($cgi) = @_;
    
    my $c .= "Env: <br/>";
    foreach ( keys(%ENV) ) {
        $c .= "'$_' : '" . $ENV{$_} . "'<br/>\n";
    }
    $c .= "<p/>\n";
    
    $c .= "Cookies: <br/>";
    foreach ( $cgi->cookie() ) {
        $c .= "'$_' : '" . $cgi->cookie($_) . "'<br/>\n";
    }
    $c .= "<p/>\n";
    return $c;
}


############
# Additional comments
#
# Cookie processing:
#
# Cookies have name=value, maybe a path, and optionally domain, which can
# be absolute (foo.com), or wildcard (.foo.com). That makes six
# combinations. We need all cookies point to the domain of the proxy, and
# have as good paths as possible.
# 1) name=value
#    n=v; domain=proxydomain; path=prefix/session/current-host/current-path
# 2) name=value; path=/foo
#    n=v; domain=proxydomain; path=prefix/session/current-host/foo
# 3) name=value; domain=some.com
#    n=v; domain=proxydomain; path=prefix/session/some.com
# 4) name=value; domain=.some.com
# THIS CAN NOT BE DONE 100%
# We would need a wildcard domain in the middle of our path! Best approach
# is to make two cookies, one for the whole some.com, and one for the current
# domain (which is supposed to be under.some.com). This still fails if the
# cookie is needed at other.some.com
#    n=v; domain=proxydomain; path=prefix/session/some.com
#    n=v; domain=proxydomain; path=prefix/session/under.some.com
# 5) name=value; path=/foo; domain=some.com
#    n=v; domain=proxydomain; path=prefix/session/hostname/foo
# 6) name=value; path=/foo; domain=.some.com
# THIS CAN NOT BE DONE 100%
# Same considerations as for #4. This seems a tad less likely situation.
#    n=v; domain=proxydomain; path=prefix/session/some.com/foo
#    n=v; domain=proxydomain; path=prefix/session/under.some.com/foo
#
# Observation: Those sites that use wildcard domains very often put the
# the path to plain "/", to make sure their global cookies reach everywhere.
# In such cases we can make a cookie with just
#    n=v; domain=proxydomain; path=prefix/session/
# and blindly let it apply for the whole session. This might take care of
# such cookies. In fact, we can do that for all cookies that have a wildcard
# domain, on the assumption that they may go wrong, but it is better to
# try. This will take the place of the alternative cookies set up in cases
# 4 and 6 above. In theory this alt cookie would be sufficient alone, but
# there is a risk that we meet several conflicting ones. Better emit a
# cookie for the current domain as well.
#
# So, to simplify:
#  * name=value stays always the same  (later we could check if the value
#    contains a domain, but I haven't come across such yet)
#  * domain is always our proxy domain. Keep the uppercasing of the 'domain'!
#  * All the magic happens in the path
#    * Always start with the prefix and session
#    * If we have a domain, it comes next. Otherwise the current domain
#    *  If we have a wildcard domain, make a second cookie header
#    * If we have a path, it comes next. (also on the second cookie header)
# Other things to consider:
#  * Keep the rest of the cookie line, for expiry times etc.
#
#


# Editor trickery
# vim: shiftwidth=4 tabstop=8 expandtab 
