# Apache2 config for the cfproxy
# The engine produces links like
#  http://cfproxy.indexdata.com/pre/fix/9999/www.google.com/s?q=water
# where
#   cfproxy.indexdata.com is the proxy host. Configured in /etc/cf-proxy/cproxy.cfg
#   /pre/fix/  is a constant prefix, also configured in /etc/cf-proxy/cproxy.cfg
#   /9999/ is the session id
#   www.google... is the url to be proxied

# The proxying is a two-level process. First we ask Apache to proxy all
# requests to the internal host. The internal host pipes everything into
# the proxy.pl script that re-establishes the session and fetches the page.
# Then it returns it to the apache proxying module, which will rewrite all
# links in it according to the rules below.

# The DNS needs to have an entry for the proxy host name. The proxy must run
# on the same machine as the cf-engine, since they share session files in /tmp,
# although the proxy and the engine may still be known under different host names.

# There is also an internal hostname (cfproxy2.indexdata.com) which is used
# as a target for the apache proxying. It needs to resolve on the machine where
# this runs, but does not need to be publicly known.


# The host names need to be configured in this file, in three different
# places. These are marked with ####
# This file assumes Debian-like file locations. On CentOS-based systems,
# adjust the paths accordingly.

# Generic outgoing proxy, will proxy anything
# to cfproxy2.indexdata.com (see below), and rewrite urls.

<VirtualHost *:80>

  # Public-facing proxy host. Every request is handed to the internal host
  # configured with ProxyPass below, and the HTML that comes back is run
  # through the proxy-html output filter so that its links point back at
  # this proxy.

  ####   Set host names and aliases here!  ####
  ServerName cfproxy.indexdata.com
  ServerAlias *.cfproxy.indexdata.com
  DocumentRoot /etc/cf-proxy
  CustomLog /var/log/apache2/cfproxy-access.log combined
  ErrorLog /var/log/apache2/cfproxy-error.log

  LogLevel Info

  # Disable compression: remove Accept-Encoding from the request, since the
  # link rewriting can't replace anything in a compressed body.
  RequestHeader unset Accept-Encoding


  # Remember session and targethost from new-form URLs
  # New format: http://cfproxy.indexdata.com/pre/fix/999999/targethost/targetpath...
  # For a request path /pre/fix/9999/www.google.com/s the three rules give
  #   PREFIX=pre/fix/9999   SESSION=9999   TARGETHOST=www.google.com
  # The prefix part is optional and matched lazily, so the first all-digit
  # path element is taken as the session id. None of the variables are set
  # if the path does not contain <digits>/<host>.
  SetEnvIf Request_URI ^/((.*?/)?[0-9]+)/([^/]+) PREFIX=$1
  SetEnvIf Request_URI ^/(.*?/)?([0-9]+)/[^/]+ SESSION=$2
  SetEnvIf Request_URI ^/(.*?/)?[0-9]+/([^/]+) TARGETHOST=$2

  # Remember the first element in the path for later rewrite
  # This is for URLS in the old format: http://99999.cproxy.indexdata.com/...
  # (It is also set for new-form URLs, but the rules that use it are
  # conditioned on TARGETHOST being unset.)
  SetEnvIf Request_URI ^/?([^/]+) REALHOST=$1

  # Reverse proxy only; do not act as a forward proxy.
  ProxyRequests Off
  <Proxy *>
    # Allow all clients to use the proxy container.
    # Apache 2.4 replaced Order/Allow with Require; the old directives only
    # work there when mod_access_compat is loaded. Pick the right syntax
    # depending on whether mod_authz_core (2.4+) is present, so the file
    # loads on both 2.2 and 2.4 with the same effective access.
    <IfModule mod_authz_core.c>
      Require all granted
    </IfModule>
    <IfModule !mod_authz_core.c>
      Order deny,allow
      Allow from all
    </IfModule>
  </Proxy>

  # Needed for the V flag (variable expansion) in the ProxyHTMLURLMap rules
  # below: the maps must be re-interpreted for every request.
  ProxyHTMLInterp On

  #### Set the internal hostname here  ####
  ProxyPass         / http://cfproxy2.indexdata.com/
  ProxyPassReverse  / http://cfproxy2.indexdata.com/

  # Debug logging. Not very useful
  # ProxyHTMLLogVerbose On

  # Enable extended proxying (doesn't seem to help much)
  ProxyHTMLExtended On
  #### 2.14.6 change!!!
  # Is needed for some sites!

  # Run every response from this virtual host through the link rewriter.
  SetOutputFilter proxy-html

  <Location />

      # Rule order matters: rules are tried from top to bottom, and a rule
      # with the L flag stops further processing when it matches.

      # Debug to see that variables are set up right:
      #ProxyHTMLURLMap  /  /R=${REALHOST|NoEnvVar}/P=${PREFIX|NoPrefix}/S=${SESSION|NoSession}/T=${TARGETHOST|NoTargetHost}/ VL

      # New mapping
      # a+b) absolute link http://foo.com. Make relative (so the browser
      # will add the proxy host), and add prefix and session in the beginning.
      # The target host will be in the URI already.
      # ProxyHTMLURLMap  http://      /${PREFIX|NoPrefix}/ V^L  TARGETHOST
      ProxyHTMLURLMap  (.*)http://      $1/${PREFIX|NoPrefix}/ VRL  TARGETHOST
      #### 2.14.6 change!!!
      # The regexp form keeps whatever precedes "http://" ($1) in place.
      # NOTE(review): only "http://" is matched; "https://" links do not
      # match this rule - confirm whether that is intended.


      # c) Relative link. Add both prefix,session and host. The browser will
      # add the proxyhost.
      ProxyHTMLURLMap  /  /${PREFIX|NoPrefix}/${TARGETHOST|NoTargetHost}/ V^L TARGETHOST

      # d) plain link "images/foo.png" - do not touch. The browser will do it all

      # Old mapping (only applied when TARGETHOST is not set):
      ProxyHTMLURLMap  /      /${REALHOST|NoEnvVar}/ V  !TARGETHOST
      ProxyHTMLURLMap http:// /                      ^  !TARGETHOST

      # The options are
      #   V for expanding variables
      #   L for last match, don't try more rules if this matches
      #   ^ match beginning of url
      #   R regexp
      # The environment variables should always be there, the alternatives
      # ("NoSession" etc) are mostly a debugging aid.
      # The last element is a condition that switches between new and old form.

  </Location>

</VirtualHost>


# cfproxy2, that runs our own proxying script
<VirtualHost *:80>

  # Internal back-end host: every request that arrives here is answered by
  # the proxy.pl CGI script, whatever the request path is.
  # NOTE(review): nothing in this block restricts which clients may connect;
  # if only the front-end proxy is meant to reach this host, confirm that
  # access is limited elsewhere (DNS, firewall or an access rule).

  #### Set the internal hostname here  ####
  ServerName cfproxy2.indexdata.com
  ServerAlias cfproxy2 
  DocumentRoot /usr/share/cf-proxy
  CustomLog /var/log/apache2/cfproxy2-access.log combined
  ErrorLog /var/log/apache2/cfproxy2-error.log

  LogLevel Info
  
  # Redirect everything to our proxy script
  # (the pattern ^/ matches every request path, so all URLs map to proxy.pl)
  AliasMatch ^/ /usr/share/cf-proxy/proxy.pl
  
  # Allow CGI execution in the script directory and treat .pl files as CGI
  # scripts; .htaccess overrides are disabled.
  <Directory /usr/share/cf-proxy>
    Options ExecCGI
    AllowOverride None
    AddHandler cgi-script .pl
  </Directory>
</VirtualHost>
