#!/home/rebelsky/perl/bin/perl
# ============================================================================
# Ravel - Ravel server for filter and processing web pages. The Ravel server
# loads a set of plugins and then sits waiting for incoming HTTP
# client request. When it recieves these, depending on the user and
# the location, it calls some of the plugins to process the page in
# some manner, possibly altering it in the process. When done, it
# returns it to the browser.
#
# Logging in and account information: The account server is
# implemented as an CGI program that operates through a standard
# web server. This allows the Ravel server to use the standard Perl
# web libraries to send the login information and retrieve the user
# information. It also makes it a relatively simple affair to code
# the account server and system for changing user account settings;
# they can all be done through the well know CGI mechanisms. The
# account server takes two parameters: UserID and Password. The
# account server should return either an empty page if the login
# failed or the user data if successfull. The user data is stored
# in a heirarchical XMLish format. For example, the user ID is
# stored as ... This makes it a simple matter
# to retrieve data from. The current tags defined for this format
# are:
#
# - Contains the ID of the user
# - The users password for logging in to Ravel
# - User's e-mail address
# - Comma delimited list of plugins to be
# applied, listed in the order that they are to be
# applied
# - Comma delimited list of groups that the
# user belongs to
#
# Plugin system: The plugins are Perl modules that are stored in a
# specific directory. (This directory can be set either through a
# command line option or through the configuration file.) They must
# have the .pm extension. When the server starts up it scans this
# directory for these files and loads all of them up into Perl,
# placing them in their own packages. (The package is named for the
# filename of the perl module. Thus a plugin called Annotate.pm would
# be placed into a package called Annotate.) Each of these modules
# must define the procedure Execute. Execute takes four parameters
# and does not need to return anything. The first parameter that it
# takes is the user ID of the user who requested the page. The second
# parameter is the full string giving the information on the user who
# requested it, in the account format listed above. The third
# parameter is an HTTP::Request object giving the request from the
# users client. Finally, the fourth parameter is an HTTP::Response
# object. This last object contains the page returned from the
# web server and modified by the other plugins to that point. The
# plugin is free to call any methods on that object to change the
# contents of the page or any other information. When all of the
# plugins have been executed, it will return the response to the
# client.
#
# Exclusion System: A set of rules may be given by a web page to
# dictate what plugins may be used with a web page and by whom. These
# plugins come from two different sources. The first source is the
# web page itself. A rule is embeded on a page using a meta tag of
# the format where the string in the
# content part list the rule. The second source is a file called
# ravel.txt located in the web server root directory. It consist of a
# series of lines, each listing a single rule. Rules listed later take
# precedence over rules listed before and all of the rules listed in
# the ravel.txt file take precedence over rules on a pages meta tag.
# The format of these rules is either:
#
# allow: user-group-plugin-path
# or
# disallow: user-group-plugin-path
#
# The path is the path name from the URL including the leading slash.
# The second part of the string, the one with the hyphens must be a
# Perl regular expression for matching the different attributes. The
# server will try to match this against each combination of groups
# and plugins for each rule. If the first part of the rule was
# disallow, then a plugin will be disabled if the server can find a
# match. If it says allow, then a plugin will be allowed if a rule
# matches for it. Matching is case insensitive. For example:
#
# disallow: .*?-.*?-.*?-.*?$
# allow: .*?-Glimmer-.*?-.*?$
#
# This pair of rules will disable all plugins for regular users, but
# allow anyone in the Glimmer group to view the page with the plugins
# enabled.
#
# Author: Andrew Kensler
#
# Current Version: 0.09
#
# History: 6/30/99 - Created
# - Wrote first version of GetConfig, ProcessClient,
# and MainLoop
# 7/1/99 - Modified GetConfig to use GetOpt package to parse
# the command line for options
# - Added status messages to GetConfig and MainLoop and
# error checking to MainLoop
# - Added InitDaemon mode to go become a daemon like
# background process, plus the configuration option
# to do this.
# - Added display banner.
# 7/2/99 - Added option to chain through another proxy.
# - Began user validation code, GetUser, SendLoginForm.
# 7/4/99 - Fixed an "connection reset by peer" problem with
# SendLoginForm by slurping up request.
# - Changed SendLoginForm to send a file instead of a
# hardcoded page. (Added configuration option for this.)
# - Continued work on log in process in general.
# 7/7/99 - Finished login code.
# 7/8/99 - Wrote code to load plugins and then apply them.
# 7/9/99 - Started PluginList and modified ApplyPlugins so that the
# list of plugins is filtered by rules.
# 7/13/99 - Finished PluginList code for applying exclusion rules.
# - Changed configuration code to read a configuration file.
# 7/16/99 - Using with functioning account server.
# - Fixed (hopefully) the problem with the child processes
# not terminating. Hopefully this will lead to better
# memory usage and resource usage as well.
#
# To Do List: - http://www.math.grin.edu/~heckr/ work but /~heckr does not.
# Why? It looks like it is messing up on the base URL.
# - The server seems to be using incorrect plugins or disregarding
# which plugin it should actually be running. Need to fix this.
# It's probably getting tripped up by the packages. (Or at least
# it acts that way.
# - Work on better handling of addresses (/ vs. /index.html) While
# the servers usually resolve / into /index.html, Ravel needs to
# do this automatically for the benefit of the plugins, so that
# a distinct URL is a distinct URL.
# - Switch to use the proxy-authenticate codes for the user login
# code. This has the the advantage of avoiding the messy
# caching and reload situation. Basic authorization should be
# sufficient. (See Web Client Programming With Perl, p. 62)
# - Better reload on log in. It acts funny the first time you
# get a page after logging in. This is because the browser
# caches the page in memory, but it is caching the form or the
# okay. When you do reload, it wants to send the POST info
# to the server, but the server doesn't like this. If nothing
# else, change the login pages so they expire the browser cache.
# - Create some way of logging out. Possibly use virtual server
# scheme and access a fake URL to do this. (Might ease log in
# problems too.)
# - Error checking; make it more robust and bullet proof.
# - What happens when you try to use SSL and an encrypted web
# connection?
# - Change the content length to reflect the new length after
# filtering. Not strictly nescessary to actually work but
# it is needed to bring up to snuff with the HTTP RFC
# ============================================================================
# Package Imports
use strict; # Strict checking mode
use Getopt::Long; # Procedure to parse command line
use Sys::Hostname; # Determine our hostname
use HTTP::Daemon; # Server daemon package
use HTTP::Status; # Exports convenient response code constants
use LWP::UserAgent; # Fetch pages for us
# ============================================================================
# Global Variables
my $server_addr; # Local address to use for the server
my $server_port; # Port to listen on
my $daemon_mode; # Whether or not to become a background daemon
my $use_proxy; # Whether or not to go through another proxy
my $time_out; # Seconds to go before timing out accounts
my $login_form_file; # Location of the login form to send
my $ok_login_page; # Location of page to send if the login is ok
my $bad_login_page; # Location of page to send if the login is bad
my $account_server; # URL string to the account server
my $plugin_path; # Path to the directory for plugins
my %connection_data; # Hash to record data on the active connections
my %user_info; # Hash to record information on current users
# ============================================================================
# DisplayBanner - Display the Web Raveler banner.
# Input: None
# Output: None
sub DisplayBanner
{
print '___________ _____ _____ _____ _____ _____ ___________'."\n";
print '__________ \/ ___ \/ ___ \/ ___ \/ ___ \/ ___ \/ __________'."\n";
print ' \/ / R \/ / A \/ / V \/ / E \/ / L \/ / '."\n";
print '__________/ /\___/ /\___/ /\___/ /\___/ /\___/ /\__________'."\n";
print '___________/\_____/\_____/\_____/\_____/\_____/\___________'."\n";
print "Web Raveler Server, V0.09.\n";
print "By Andrew Kensler.\n";
print "\n";
}
# ============================================================================
# GetConfig - Set the values of certain configuration variables. Then it
# executes the configuration file. Finally, it calls a procedure
# in the GetOpt package to parse the command line to get the
# information.
# Input: None
# Output: None
sub GetConfig
{
# Print status information
print "Getting configuration information...\n";
# Set up defaults for the options
$server_addr=hostname; # Local address for server to be on
$server_port=54321; # Port to listen on
$daemon_mode=0; # Don't go into daemon mode by default
$use_proxy=0; # Don't go through another proxy by default
$time_out=1200; # Default in seconds to time out account
$login_form_file="LoginForm.html"; # Filename of the login form to return
$ok_login_page="LoginOk.html"; # Filename of the page to return after login
$bad_login_page="LoginBad.html"; # Page to return on bad login
$plugin_path="./Plugins"; # Path to the plugins directory
# FIXME: DEFAULT ACCOUNT SERVER IS HARDCODED IN RIGHT NOW
$account_server="http://www.math.grin.edu/~kensler/Research/Account/AccountServer.cgi";
# Load the configuration file. The configuration file should contain
# Perl code like that above for setting defaults. Any valid Perl code
# is fine, but you'll probably want to limit it to setting the global
# variables and comments. However you could always use this to do
# something like prompt the user for these settings. This mechanism
# for configuration files was suggested by Perl guru Tom Christiansen.
do '$ENV{HOME}/.Ravel';
# Parse the command line
Getopt::Long::GetOptions
"address=s"=>\$server_addr,
"port=i"=>\$server_port,
"daemon"=>\$daemon_mode,
"proxy"=>\$use_proxy,
"timeout=i"=>\$time_out,
"loginform=s"=>\$login_form_file,
"oklogin=s"=>\$ok_login_page,
"badlogin=s"=>\$bad_login_page,
"accountserver=s"=>\$account_server,
"pluginpath=s"=>\$plugin_path;
}
# ============================================================================
# InitDaemonMode - This turns the program into a daemon. It does this by
# forking off a child process and terminating the parent
# process. This puts the program into the back ground. Next
# for the child, it sets it to trap hang up messages
# Input: None
# Output: None
sub InitDaemonMode
{
# Print status information
print "Going into the background...\n";
# Fork off a child process to handle it. The parent should terminate,
# leaving the child process running.
if (fork()!=0)
{
exit 0;
}
# The child process should ignore hangup signals. This way, when the user
# logs out, it continues running in the background. It continues running
# until it is killed.
$SIG{HUP}='IGNORE';
}
# ============================================================================
# LoadPlugins - Scan the plugin directory for Perl module files and load them
# into their own package namespace.
# Input: None
# Output: None
sub LoadPlugins
{
# Local Variables
my $plugin_file; # The fully qualified file name for a plugin
my $plugin; # The bare plugin name without a path or such
# Print status information.
print "Loading plugins...\n";
# Scan the directory for plugins.
while($plugin_file=<$plugin_path/*.pm>)
{
$plugin_file=~/([^\/]*).pm$/;
$plugin=$1;
# Print more status information.
print " -> $plugin\n";
# Load the plugin into its own namespace.
eval "package $plugin;";
require $plugin_file;
package main;
}
}
# ============================================================================
# SendLoginForm - Send an HTML form to the client connection requesting the
# user ID and password for them to log in.
# Input: client - A client connection (HTTP::Daemon::ClientConn)
# Output: None
sub SendLoginForm
{
# Parameters
my $client=shift; # Client connection (HTTP::Daemon::ClientConn)
# Local Variables
my $client_request; # Request from the client (HTTP::Request)
# We need to get the client requests so as to placate the browser. If we
# don't slurp up the HTTP requests, then Netscape generates an error
# message saying "connection reset by peer," which gets rather annoying.
while ($client_request=$client->get_request) {}
# Now send the login form from the file back to the client.
$client->send_file_response($login_form_file);
}
# ============================================================================
# UserLogin - Read the client request and extract the user log in information
# from it. Then pass that to the user account server to try to
# get the information for that account, if possible.
# Input: client - A client connection (HTTP::Daemon::ClientConn)
# user_agent - To access the account server (LWP::UserAgent)
# Output: User ID of the user at the client end (String)
# or undef if they still need to be authenticated or could not.
sub UserLogin
{
# Parameters
my $client=shift; # Client connection (HTTP::Daemon::ClientConn)
my $user_agent=shift; # User agent to fetch pages (LWP::UserAgent)
# Local variables
my $login_request; # Request from the client (HTTP::Request)
my $server_url; # URL to the account server (URI::URL)
my $login_response; # Response from the server (HTTP::Server)
my $info; # User information (String)
# Get the request from the client. This should contain the information
# that we need. Then we create a new URL object to point to the account
# server and put it back in the URL. This way we redirect the request to
# the account server, but we leave the content alone so that the user
# information is left alone.
$login_request=$client->get_request;
$server_url=new URI::URL($account_server);
$login_request->url($server_url);
# Now fetch the information from the account server and put it into a
# string to work with.
$login_response=$user_agent->request($login_request);
$info=$login_response->content;
# If it was a bad login try, then the server will respond with an empty
# page. If we get that, or fail to contact the server, then we need to
# send back a page saying that the login has failed. We also return an
# undef to indicate that the login failed.
if (($info eq "")||$login_response->is_error)
{
$client->send_file_response($bad_login_page);
return undef;
}
# Otherwise, the login is all right. So we return a page saying that
# the user has successfully logged in. We stuff their account info
# into the user information hash and then return their user id.
else
{
$client->send_file_response($ok_login_page);
$info=~/(.*?)<\/USERID>/;
$user_info{$1}=$info;
return $1;
}
}
# ============================================================================
# GetUser - Look to see if there is a current entry in the hash of IP
# addresses for a given connection. If there is not, then we
# bounce back a form asking for the user to login.
# Input: client - A client connection (HTTP::Daemon::ClientConn)
# user_agent - To access the account server (LWP::UserAgent)
# Output: User ID of the user at the client end (String)
# or undef if we're still doing authentication stuff.
sub GetUser
{
# Parameters
my $client=shift; # Client connection (HTTP::Daemon::ClientConn)
my $user_agent=shift; # User agent to fetch pages (LWP::UserAgent)
# Local Variables
my $status; # Status of the client (string)
my $last_time; # Time of last activity from client (integer)
my $user_id; # User id
# Put the connection status info into an easier variable.
$status=$connection_data{$client->peerhost}{"status"};
# If the user is in the loging in state, then the info that they send
# is going to contain the user name and password for them. So we need to
# process this
if ($status eq "logingin")
{
$user_id=UserLogin($client,$user_agent);
if ($user_id)
{
$connection_data{$client->peerhost}{"time"}=time;
$connection_data{$client->peerhost}{"user"}=$user_id;
$connection_data{$client->peerhost}{"status"}="loggedin";
}
return undef;
}
# Look up the last access time. If there hasn't been a connection from
# that address before or the last activity was longer than the time out
# time then tell them that they will have to login in again. We also
# give them the logging in status so when the click the button, we catch
# their data.
$last_time=$connection_data{$client->peerhost}{"time"};
if ((!$status)||((time-$last_time)>$time_out))
{
SendLoginForm $client;
$connection_data{$client->peerhost}{"status"}="logingin";
return undef;
}
# Otherwise, we can update their last access time entry and return the
# user ID associated with that connection.
else
{
$connection_data{$client->peerhost}{"time"}=time;
return $connection_data{$client->peerhost}{"user"};
}
}
# ============================================================================
# PluginList - This extract the list of plugins from the user info and then
# applies the exclusion rules associated with the page to the
# list to remove all of the plugins that should be excluded.
# The returned list is the list of plugins that are approved
# to be run on this page by the user.
# Input: user - The ID of the user for this request (String)
# user_agent - User agent to fetch rules (LWP::UserAgent)
# client_request - Request from the client (HTTP::Request)
# server_response - The response from the server (HTTP::Response)
# Output: None
sub PluginList
{
# Parameters
my $user=shift; # ID of the user for this request (String)
my $user_agent=shift; # User agent to fetch rules (LWP::UserAgent)
my $client_request=shift; # Request from the client (HTTP::Request)
my $server_response=shift; # The response from the server (HTTP::Response)
# Local Variables
my $headers; # Header from the response (HTTP::Headers)
my $content_type; # The content-type from the header (String)
my $html; # The HTML contents of the page (String)
my $url; # The URL of the request (URI::URL)
my $rules_request; # A request for the rules file (HTTP::Request)
my $rules_response; # Rules returned by the server (HTTP::Response)
my @rule_list; # List of rules controlling plugins (Array)
my $rule; # One of the rules in the list (String)
my @plugin_list; # List of plugins in order to apply (Array)
my $plugin; # One of the plugins in the list (String)
my @group_list; # List of groups this user is in (Array)
my $group; # One of the groups in the list
my $path_name; # The directory path from the URL (String)
my $state; # The state to set a plugin to (String)
my $pattern; # A rules pattern to try to match (String)
my $line; # A line to try to match by a rule (String)
my %plugin_state; # On/off state for each plugin (Hash)
# Is this an HTML page? If so, we scan through the HTML on the page and
# look for meta tags that correspond to Ravel rules. Then we extract the
# rule and push it onto an array.
$headers=$server_response->headers;
$content_type=$headers->header("content-type");
if ($content_type eq "text/html")
{
$html=$server_response->content;
while($html=~/<\s*meta\s+name\s*=\s*"?Ravel"?\s+content\s*=\s*"?(.*?)"?\s*>/gi)
{
push(@rule_list,$1);
}
}
# Now try to get the rules from a ravel.txt file in the root directory on
# the server. If there is such a file, we add each line to the list of
# rules.
$url=$client_request->url;
# FIXME: QUIT BANGING FOR NONEXISTENT FILES ON THE SERVERS WHILE WE TEST
# $rules_request=new HTTP::Request("GET","http://".$url->host."/ravel.txt");
# $rules_response=$user_agent->request($rules_request);
# if ($rules_response->is_success)
# {
# $html=$rules_response->content;
# push(@rule_list,split(/\n/,$html));
# }
# Extract the list of plugins from the user info.
$user_info{$user}=~/(.*?)<\/PLUGINLIST>/;
@plugin_list=split(/,/,$1);
# Extract the list of groups the user is in.
$user_info{$user}=~/(.*?)<\/GROUPLIST>/;
@group_list=split(/,/,$1);
# Extract the directory and file path from the request URL.
$path_name=$url->path;
# Now for each of the rules, we apply extract the state that should be set
# and the pattern to match to put a plugin into that state. Then we
# construct lines describing each possible group and plugin and try to see
# if we have a match. If we do, then for the time being, we set the plugin
# to whatever state was described for the time being
foreach $rule (@rule_list)
{
$rule=~/(.*?):\s*(.*)/;
$state=$1;
$pattern=$2;
foreach $plugin (@plugin_list)
{
foreach $group (@group_list)
{
$line="$user-$group-$plugin-$path_name";
if ($line=~/$pattern/i)
{
$plugin_state{$plugin}=$state;
last;
}
}
}
}
# Finally, we weed out all of the plugins in the list with entries in the
# state hash that say "disallow". We finish by returning this culled list
# of plugins to use.
@plugin_list=grep $plugin_state{$_}ne"disallow", @plugin_list;
return @plugin_list;
}
# ============================================================================
# ApplyPlugins - For each client request, this is called with the name of the
# user, the client's request and the server's response. It is
# free to do with them as it wishes. Namely, it can invoke
# methods on the response to change the content before the
# server returns them.
# Input: user - The ID of the user for this request (String)
# user_agent - User agent to fetch rules (LWP::UserAgent)
# client_request - Request from the client (HTTP::Request)
# server_response - The response from the server (HTTP::Response)
# Output: None
sub ApplyPlugins
{
# Parameters
my $user=shift; # ID of the user for this request (String)
my $user_agent=shift; # User agent to fetch rules (LWP::UserAgent)
my $client_request=shift; # Request from the client (HTTP::Request)
my $server_response=shift; # The response from the server (HTTP::Response)
# Local Variables
my @plugin_list; # The plugins to apply in list format.
my $plugin; # The current plugin to work on
# Grab the list of plugins to apply. The list is returned in the order
# to apply them and with all of the rules in the HTML and from the server
# about what plugins to allow taken into account.
@plugin_list=PluginList($user,$user_agent,$client_request,$server_response);
# Loop for each of the plugins in the list and in order and execute
# the plugin on the data. We pass the ID of the user, the client's
# request, and the server's response. The plugin can invoke any
# methods that it likes to alter the server response. This object will
# get returned to the client.
foreach $plugin (@plugin_list)
{
{$plugin}::Execute($user,$user_info{$user},$client_request,$server_response);
}
}
# ============================================================================
# ProcessClient - Given a client, this will do all of the things nescessary
# to handle it. One of the key things that it will do is to
# fork of a child process to handle it.
# Input: client - A client connection (HTTP::Daemon::ClientConn)
# user_agent - User agent used to fetch pages (LWP::UserAgent)
# Output: None
sub ProcessClient
{
# Parameters
my $client=shift; # Client connection (HTTP::Daemon::ClientConn)
my $user_agent=shift; # User agent to fetch pages (LWP::UserAgent)
# Local Variables
my $pid; # Process ID from fork call (Integer)
my $client_request; # Request from the client (HTTP::Request)
my $server_response; # The response from the server (HTTP::Response)
my $user; # The ID of the user (String)
# Get the user ID associated with this client. If we don't get one back
# then close the connection and return; they've been served by the login
# routines.
if (!($user=GetUser($client,$user_agent)))
{
close $client;
return;
}
# Fork off a child process to handle it. The parent will return to the
# main loop to recieve more request.
$pid=fork;
if ($pid!=0)
{
close $client;
return;
}
# Keep fetching client requests while there are any to be had.
while ($client_request=$client->get_request)
{
# Pass the request to the server to try to get a response. We use the
# simple_request method instead of the request method, because we the
# request method automatically intercepts redirect responses and
# requests the page the redirect points to. The browser never catches
# the redirect and so its expansion of relative URLs gets screwed up.
# So we use the simple_request method because it leaves handling of
# the redirects to the browser. Lesson learned.
$server_response=$user_agent->simple_request($client_request);
# Now run it through the plugins.
ApplyPlugins($user,$user_agent,$client_request,$server_response);
# Pass the response back to the client.
$client->send_response($server_response);
}
# We're done, so close the connection with the client.
close $client;
}
# ============================================================================
# MainLoop - This is the main server loop. It sets itself up to listen on the
# given port for client connections and calls ProcessRequest to
# deal with each one of them.
# Input: None
# Output: None
sub MainLoop
{
# Local Variables
my $user_agent; # User agent to fetch pages (LWP::UserAgent)
my $server_daemon; # Server daemon object (HTTP::Daemon)
my $client; # Client connection (HTTP::Daemon::ClientConn)
# Print out status information.
print "Starting up server...\n";
# We want to ignore signals from the child processes that we are going
# to spawn. We don't actually need to do anything; ignoring them is
# sufficient to acknowledge them. But UNIX can be weird about such things
# and this theoretically prevents zombie child processes, which is a Good
# Thing, considering the resource consumption of Perl and the limited
# process table.
$SIG{CHLD}='IGNORE';
# Set up a user agent for the child processes to use to fetch the pages
# from the servers. This way it will be ready for the child processes
# to use when they need it.
$user_agent=new LWP::UserAgent;
$user_agent->agent("Ravel/0.09");
# If the user wants this to chain through another proxy, they can load
# the environment variables, which this will retrieve and use to set
# itself to go through the given proxies.
$user_agent->env_proxy if $use_proxy;
# Create a daemon object to list for incoming client request. We'll set
# it to listen on the given port.
$server_daemon=new HTTP::Daemon
LocalAddr=>$server_addr,
LocalPort=>$server_port;
# Tell the user that we are successfully running.
print "Server active at ".$server_daemon->url."\n";
# Here is the main loop. We block on accept and wait for client
# connections. For each one, we call ProcessRequest to handle it.
while($client=$server_daemon->accept)
{
ProcessClient($client,$user_agent);
}
}
# ============================================================================
# Main Sequence
DisplayBanner; # Display our banner to the user.
GetConfig; # Get the configuration information.
InitDaemonMode if $daemon_mode; # Go into the background daemon style if desired.
LoadPlugins; # Load up all of the plugins
MainLoop; # Execute the server loop.