#!/home/rebelsky/perl/bin/perl # ============================================================================ # Ravel - Ravel server for filter and processing web pages. The Ravel server # loads a set of plugins and then sits waiting for incoming HTTP # client request. When it recieves these, depending on the user and # the location, it calls some of the plugins to process the page in # some manner, possibly altering it in the process. When done, it # returns it to the browser. # # Logging in and account information: The account server is # implemented as an CGI program that operates through a standard # web server. This allows the Ravel server to use the standard Perl # web libraries to send the login information and retrieve the user # information. It also makes it a relatively simple affair to code # the account server and system for changing user account settings; # they can all be done through the well know CGI mechanisms. The # account server takes two parameters: UserID and Password. The # account server should return either an empty page if the login # failed or the user data if successfull. The user data is stored # in a heirarchical XMLish format. For example, the user ID is # stored as ... This makes it a simple matter # to retrieve data from. The current tags defined for this format # are: # # - Contains the ID of the user # - The users password for logging in to Ravel # - User's e-mail address # - Comma delimited list of plugins to be # applied, listed in the order that they are to be # applied # - Comma delimited list of groups that the # user belongs to # # Plugin system: The plugins are Perl modules that are stored in a # specific directory. (This directory can be set either through a # command line option or through the configuration file.) They must # have the .pm extension. When the server starts up it scans this # directory for these files and loads all of them up into Perl, # placing them in their own packages. (The package is named for the # filename of the perl module. Thus a plugin called Annotate.pm would # be placed into a package called Annotate.) Each of these modules # must define the procedure Execute. Execute takes four parameters # and does not need to return anything. The first parameter that it # takes is the user ID of the user who requested the page. The second # parameter is the full string giving the information on the user who # requested it, in the account format listed above. The third # parameter is an HTTP::Request object giving the request from the # users client. Finally, the fourth parameter is an HTTP::Response # object. This last object contains the page returned from the # web server and modified by the other plugins to that point. The # plugin is free to call any methods on that object to change the # contents of the page or any other information. When all of the # plugins have been executed, it will return the response to the # client. # # Exclusion System: A set of rules may be given by a web page to # dictate what plugins may be used with a web page and by whom. These # plugins come from two different sources. The first source is the # web page itself. A rule is embeded on a page using a meta tag of # the format where the string in the # content part list the rule. The second source is a file called # ravel.txt located in the web server root directory. It consist of a # series of lines, each listing a single rule. Rules listed later take # precedence over rules listed before and all of the rules listed in # the ravel.txt file take precedence over rules on a pages meta tag. # The format of these rules is either: # # allow: user-group-plugin-path # or # disallow: user-group-plugin-path # # The path is the path name from the URL including the leading slash. # The second part of the string, the one with the hyphens must be a # Perl regular expression for matching the different attributes. The # server will try to match this against each combination of groups # and plugins for each rule. If the first part of the rule was # disallow, then a plugin will be disabled if the server can find a # match. If it says allow, then a plugin will be allowed if a rule # matches for it. Matching is case insensitive. For example: # # disallow: .*?-.*?-.*?-.*?$ # allow: .*?-Glimmer-.*?-.*?$ # # This pair of rules will disable all plugins for regular users, but # allow anyone in the Glimmer group to view the page with the plugins # enabled. # # Author: Andrew Kensler # # Current Version: 0.09 # # History: 6/30/99 - Created # - Wrote first version of GetConfig, ProcessClient, # and MainLoop # 7/1/99 - Modified GetConfig to use GetOpt package to parse # the command line for options # - Added status messages to GetConfig and MainLoop and # error checking to MainLoop # - Added InitDaemon mode to go become a daemon like # background process, plus the configuration option # to do this. # - Added display banner. # 7/2/99 - Added option to chain through another proxy. # - Began user validation code, GetUser, SendLoginForm. # 7/4/99 - Fixed an "connection reset by peer" problem with # SendLoginForm by slurping up request. # - Changed SendLoginForm to send a file instead of a # hardcoded page. (Added configuration option for this.) # - Continued work on log in process in general. # 7/7/99 - Finished login code. # 7/8/99 - Wrote code to load plugins and then apply them. # 7/9/99 - Started PluginList and modified ApplyPlugins so that the # list of plugins is filtered by rules. # 7/13/99 - Finished PluginList code for applying exclusion rules. # - Changed configuration code to read a configuration file. # 7/16/99 - Using with functioning account server. # - Fixed (hopefully) the problem with the child processes # not terminating. Hopefully this will lead to better # memory usage and resource usage as well. # 7/22/99 - I was stupid. I forgot to explicitely make the child # process exit. Doh!! No wonder it was leaving zombies. # # To Do List: - The server seems to be using incorrect plugins or disregarding # which plugin it should actually be running. Need to fix this. # It's probably getting tripped up by the packages. (Or at least # it acts that way. # - Work on better handling of addresses (/ vs. /index.html) While # the servers usually resolve / into /index.html, Ravel needs to # do this automatically for the benefit of the plugins, so that # a distinct URL is a distinct URL. # - Switch to use the proxy-authenticate codes for the user login # code. This has the the advantage of avoiding the messy # caching and reload situation. Basic authorization should be # sufficient. (See Web Client Programming With Perl, p. 62) # - Better reload on log in. It acts funny the first time you # get a page after logging in. This is because the browser # caches the page in memory, but it is caching the form or the # okay. When you do reload, it wants to send the POST info # to the server, but the server doesn't like this. If nothing # else, change the login pages so they expire the browser cache. # - Create some way of logging out. Possibly use virtual server # scheme and access a fake URL to do this. (Might ease log in # problems too.) # - Error checking; make it more robust and bullet proof. # - What happens when you try to use SSL and an encrypted web # connection? # - Change the content length to reflect the new length after # filtering. Not strictly nescessary to actually work but # it is needed to bring up to snuff with the HTTP RFC # ============================================================================ # Package Imports use strict; # Strict checking mode use Getopt::Long; # Procedure to parse command line use Sys::Hostname; # Determine our hostname use HTTP::Daemon; # Server daemon package use HTTP::Status; # Exports convenient response code constants use LWP::UserAgent; # Fetch pages for us # ============================================================================ # Global Variables my $server_addr; # Local address to use for the server my $server_port; # Port to listen on my $daemon_mode; # Whether or not to become a background daemon my $use_proxy; # Whether or not to go through another proxy my $time_out; # Seconds to go before timing out accounts my $login_form_file; # Location of the login form to send my $ok_login_page; # Location of page to send if the login is ok my $bad_login_page; # Location of page to send if the login is bad my $account_server; # URL string to the account server my $plugin_path; # Path to the directory for plugins my %connection_data; # Hash to record data on the active connections my %user_info; # Hash to record information on current users # ============================================================================ # DisplayBanner - Display the Web Raveler banner. # Input: None # Output: None sub DisplayBanner { print '___________ _____ _____ _____ _____ _____ ___________'."\n"; print '__________ \/ ___ \/ ___ \/ ___ \/ ___ \/ ___ \/ __________'."\n"; print ' \/ / R \/ / A \/ / V \/ / E \/ / L \/ / '."\n"; print '__________/ /\___/ /\___/ /\___/ /\___/ /\___/ /\__________'."\n"; print '___________/\_____/\_____/\_____/\_____/\_____/\___________'."\n"; print "Web Raveler Server, V0.09.\n"; print "By Andrew Kensler.\n"; print "\n"; } # ============================================================================ # GetConfig - Set the values of certain configuration variables. Then it # executes the configuration file. Finally, it calls a procedure # in the GetOpt package to parse the command line to get the # information. # Input: None # Output: None sub GetConfig { # Print status information print "Getting configuration information...\n"; # Set up defaults for the options $server_addr=hostname; # Local address for server to be on $server_port=54321; # Port to listen on $daemon_mode=0; # Don't go into daemon mode by default $use_proxy=0; # Don't go through another proxy by default $time_out=1200; # Default in seconds to time out account $login_form_file="LoginForm.html"; # Filename of the login form to return $ok_login_page="LoginOk.html"; # Filename of the page to return after login $bad_login_page="LoginBad.html"; # Page to return on bad login $plugin_path="./Plugins"; # Path to the plugins directory # FIXME: DEFAULT ACCOUNT SERVER IS HARDCODED IN RIGHT NOW $account_server="http://www.math.grin.edu/~kensler/Research/Account/AccountServer.cgi"; # Load the configuration file. The configuration file should contain # Perl code like that above for setting defaults. Any valid Perl code # is fine, but you'll probably want to limit it to setting the global # variables and comments. However you could always use this to do # something like prompt the user for these settings. This mechanism # for configuration files was suggested by Perl guru Tom Christiansen. do '$ENV{HOME}/.Ravel'; # Parse the command line Getopt::Long::GetOptions "address=s"=>\$server_addr, "port=i"=>\$server_port, "daemon"=>\$daemon_mode, "proxy"=>\$use_proxy, "timeout=i"=>\$time_out, "loginform=s"=>\$login_form_file, "oklogin=s"=>\$ok_login_page, "badlogin=s"=>\$bad_login_page, "accountserver=s"=>\$account_server, "pluginpath=s"=>\$plugin_path; } # ============================================================================ # InitDaemonMode - This turns the program into a daemon. It does this by # forking off a child process and terminating the parent # process. This puts the program into the back ground. Next # for the child, it sets it to trap hang up messages # Input: None # Output: None sub InitDaemonMode { # Print status information print "Going into the background...\n"; # Fork off a child process to handle it. The parent should terminate, # leaving the child process running. if (fork()!=0) { exit 0; } # The child process should ignore hangup signals. This way, when the user # logs out, it continues running in the background. It continues running # until it is killed. $SIG{HUP}='IGNORE'; } # ============================================================================ # Reaper - This routine is responsible for helping to kill of child processes. # It reinstalls itself to handle the next signal, just in case it is # running on a SysV system and calls wait so that the child process # can exit correctly. # Input: None # Output: None sub Reaper { $SIG{CHLD} = \&Reaper; wait; } # ============================================================================ # LoadPlugins - Scan the plugin directory for Perl module files and load them # into their own package namespace. # Input: None # Output: None sub LoadPlugins { # Local Variables my $plugin_file; # The fully qualified file name for a plugin my $plugin; # The bare plugin name without a path or such # Print status information. print "Loading plugins...\n"; # Scan the directory for plugins. while($plugin_file=<$plugin_path/*.pm>) { $plugin_file=~/([^\/]*).pm$/; $plugin=$1; # Print more status information. print " -> $plugin\n"; # Load the plugin into its own namespace. eval "package $plugin;"; require $plugin_file; package main; } } # ============================================================================ # SendLoginForm - Send an HTML form to the client connection requesting the # user ID and password for them to log in. # Input: client - A client connection (HTTP::Daemon::ClientConn) # Output: None sub SendLoginForm { # Parameters my $client=shift; # Client connection (HTTP::Daemon::ClientConn) # Local Variables my $client_request; # Request from the client (HTTP::Request) # We need to get the client requests so as to placate the browser. If we # don't slurp up the HTTP requests, then Netscape generates an error # message saying "connection reset by peer," which gets rather annoying. while ($client_request=$client->get_request) {} # Now send the login form from the file back to the client. $client->send_file_response($login_form_file); } # ============================================================================ # UserLogin - Read the client request and extract the user log in information # from it. Then pass that to the user account server to try to # get the information for that account, if possible. # Input: client - A client connection (HTTP::Daemon::ClientConn) # user_agent - To access the account server (LWP::UserAgent) # Output: User ID of the user at the client end (String) # or undef if they still need to be authenticated or could not. sub UserLogin { # Parameters my $client=shift; # Client connection (HTTP::Daemon::ClientConn) my $user_agent=shift; # User agent to fetch pages (LWP::UserAgent) # Local variables my $login_request; # Request from the client (HTTP::Request) my $server_url; # URL to the account server (URI::URL) my $login_response; # Response from the server (HTTP::Server) my $info; # User information (String) # Get the request from the client. This should contain the information # that we need. Then we create a new URL object to point to the account # server and put it back in the URL. This way we redirect the request to # the account server, but we leave the content alone so that the user # information is left alone. $login_request=$client->get_request; $server_url=new URI::URL($account_server); $login_request->url($server_url); # Now fetch the information from the account server and put it into a # string to work with. $login_response=$user_agent->request($login_request); $info=$login_response->content; # If it was a bad login try, then the server will respond with an empty # page. If we get that, or fail to contact the server, then we need to # send back a page saying that the login has failed. We also return an # undef to indicate that the login failed. if (($info eq "")||$login_response->is_error) { $client->send_file_response($bad_login_page); return undef; } # Otherwise, the login is all right. So we return a page saying that # the user has successfully logged in. We stuff their account info # into the user information hash and then return their user id. else { $client->send_file_response($ok_login_page); $info=~/(.*?)<\/USERID>/; $user_info{$1}=$info; return $1; } } # ============================================================================ # GetUser - Look to see if there is a current entry in the hash of IP # addresses for a given connection. If there is not, then we # bounce back a form asking for the user to login. # Input: client - A client connection (HTTP::Daemon::ClientConn) # user_agent - To access the account server (LWP::UserAgent) # Output: User ID of the user at the client end (String) # or undef if we're still doing authentication stuff. sub GetUser { # Parameters my $client=shift; # Client connection (HTTP::Daemon::ClientConn) my $user_agent=shift; # User agent to fetch pages (LWP::UserAgent) # Local Variables my $status; # Status of the client (string) my $last_time; # Time of last activity from client (integer) my $user_id; # User id # Put the connection status info into an easier variable. $status=$connection_data{$client->peerhost}{"status"}; # If the user is in the loging in state, then the info that they send # is going to contain the user name and password for them. So we need to # process this if ($status eq "logingin") { $user_id=UserLogin($client,$user_agent); if ($user_id) { $connection_data{$client->peerhost}{"time"}=time; $connection_data{$client->peerhost}{"user"}=$user_id; $connection_data{$client->peerhost}{"status"}="loggedin"; } return undef; } # Look up the last access time. If there hasn't been a connection from # that address before or the last activity was longer than the time out # time then tell them that they will have to login in again. We also # give them the logging in status so when the click the button, we catch # their data. $last_time=$connection_data{$client->peerhost}{"time"}; if ((!$status)||((time-$last_time)>$time_out)) { SendLoginForm $client; $connection_data{$client->peerhost}{"status"}="logingin"; return undef; } # Otherwise, we can update their last access time entry and return the # user ID associated with that connection. else { $connection_data{$client->peerhost}{"time"}=time; return $connection_data{$client->peerhost}{"user"}; } } # ============================================================================ # PluginList - This extract the list of plugins from the user info and then # applies the exclusion rules associated with the page to the # list to remove all of the plugins that should be excluded. # The returned list is the list of plugins that are approved # to be run on this page by the user. # Input: user - The ID of the user for this request (String) # user_agent - User agent to fetch rules (LWP::UserAgent) # client_request - Request from the client (HTTP::Request) # server_response - The response from the server (HTTP::Response) # Output: None sub PluginList { # Parameters my $user=shift; # ID of the user for this request (String) my $user_agent=shift; # User agent to fetch rules (LWP::UserAgent) my $client_request=shift; # Request from the client (HTTP::Request) my $server_response=shift; # The response from the server (HTTP::Response) # Local Variables my $headers; # Header from the response (HTTP::Headers) my $content_type; # The content-type from the header (String) my $html; # The HTML contents of the page (String) my $url; # The URL of the request (URI::URL) my $rules_request; # A request for the rules file (HTTP::Request) my $rules_response; # Rules returned by the server (HTTP::Response) my @rule_list; # List of rules controlling plugins (Array) my $rule; # One of the rules in the list (String) my @plugin_list; # List of plugins in order to apply (Array) my $plugin; # One of the plugins in the list (String) my @group_list; # List of groups this user is in (Array) my $group; # One of the groups in the list my $path_name; # The directory path from the URL (String) my $state; # The state to set a plugin to (String) my $pattern; # A rules pattern to try to match (String) my $line; # A line to try to match by a rule (String) my %plugin_state; # On/off state for each plugin (Hash) # Is this an HTML page? If so, we scan through the HTML on the page and # look for meta tags that correspond to Ravel rules. Then we extract the # rule and push it onto an array. $headers=$server_response->headers; $content_type=$headers->header("content-type"); if ($content_type eq "text/html") { $html=$server_response->content; while($html=~/<\s*meta\s+name\s*=\s*"?Ravel"?\s+content\s*=\s*"?(.*?)"?\s*>/gi) { push(@rule_list,$1); } } # Now try to get the rules from a ravel.txt file in the root directory on # the server. If there is such a file, we add each line to the list of # rules. $url=$client_request->url; # FIXME: QUIT BANGING FOR NONEXISTENT FILES ON THE SERVERS WHILE WE TEST # $rules_request=new HTTP::Request("GET","http://".$url->host."/ravel.txt"); # $rules_response=$user_agent->request($rules_request); # if ($rules_response->is_success) # { # $html=$rules_response->content; # push(@rule_list,split(/\n/,$html)); # } # Extract the list of plugins from the user info. $user_info{$user}=~/(.*?)<\/PLUGINLIST>/; @plugin_list=split(/,/,$1); # Extract the list of groups the user is in. $user_info{$user}=~/(.*?)<\/GROUPLIST>/; @group_list=split(/,/,$1); # Extract the directory and file path from the request URL. $path_name=$url->path; # Now for each of the rules, we apply extract the state that should be set # and the pattern to match to put a plugin into that state. Then we # construct lines describing each possible group and plugin and try to see # if we have a match. If we do, then for the time being, we set the plugin # to whatever state was described for the time being foreach $rule (@rule_list) { $rule=~/(.*?):\s*(.*)/; $state=$1; $pattern=$2; foreach $plugin (@plugin_list) { foreach $group (@group_list) { $line="$user-$group-$plugin-$path_name"; if ($line=~/$pattern/i) { $plugin_state{$plugin}=$state; last; } } } } # Finally, we weed out all of the plugins in the list with entries in the # state hash that say "disallow". We finish by returning this culled list # of plugins to use. @plugin_list=grep $plugin_state{$_}ne"disallow", @plugin_list; return @plugin_list; } # ============================================================================ # ApplyPlugins - For each client request, this is called with the name of the # user, the client's request and the server's response. It is # free to do with them as it wishes. Namely, it can invoke # methods on the response to change the content before the # server returns them. # Input: user - The ID of the user for this request (String) # user_agent - User agent to fetch rules (LWP::UserAgent) # client_request - Request from the client (HTTP::Request) # server_response - The response from the server (HTTP::Response) # Output: None sub ApplyPlugins { # Parameters my $user=shift; # ID of the user for this request (String) my $user_agent=shift; # User agent to fetch rules (LWP::UserAgent) my $client_request=shift; # Request from the client (HTTP::Request) my $server_response=shift; # The response from the server (HTTP::Response) # Local Variables my @plugin_list; # The plugins to apply in list format. my $plugin; # The current plugin to work on # Grab the list of plugins to apply. The list is returned in the order # to apply them and with all of the rules in the HTML and from the server # about what plugins to allow taken into account. @plugin_list=PluginList($user,$user_agent,$client_request,$server_response); # Loop for each of the plugins in the list and in order and execute # the plugin on the data. We pass the ID of the user, the client's # request, and the server's response. The plugin can invoke any # methods that it likes to alter the server response. This object will # get returned to the client. foreach $plugin (@plugin_list) { {$plugin}::Execute($user,$user_info{$user},$client_request,$server_response); } } # ============================================================================ # ProcessClient - Given a client, this will do all of the things nescessary # to handle it. One of the key things that it will do is to # fork of a child process to handle it. # Input: client - A client connection (HTTP::Daemon::ClientConn) # user_agent - User agent used to fetch pages (LWP::UserAgent) # Output: None sub ProcessClient { # Parameters my $client=shift; # Client connection (HTTP::Daemon::ClientConn) my $user_agent=shift; # User agent to fetch pages (LWP::UserAgent) # Local Variables my $pid; # Process ID from fork call (Integer) my $client_request; # Request from the client (HTTP::Request) my $server_response; # The response from the server (HTTP::Response) my $user; # The ID of the user (String) # Get the user ID associated with this client. If we don't get one back # then close the connection and return; they've been served by the login # routines. if (!($user=GetUser($client,$user_agent))) { close $client; return; } # Fork off a child process to handle it. The parent will return to the # main loop to recieve more request. $pid=fork; if ($pid!=0) { close $client; return; } # Keep fetching client requests while there are any to be had. while ($client_request=$client->get_request) { # Pass the request to the server to try to get a response. We use the # simple_request method instead of the request method, because we the # request method automatically intercepts redirect responses and # requests the page the redirect points to. The browser never catches # the redirect and so its expansion of relative URLs gets screwed up. # So we use the simple_request method because it leaves handling of # the redirects to the browser. Lesson learned. $server_response=$user_agent->simple_request($client_request); # Now run it through the plugins. ApplyPlugins($user,$user_agent,$client_request,$server_response); # Pass the response back to the client. $client->send_response($server_response); } # We're done, so close the connection with the client and exit. close $client; exit(0); } # ============================================================================ # MainLoop - This is the main server loop. It sets itself up to listen on the # given port for client connections and calls ProcessRequest to # deal with each one of them. # Input: None # Output: None sub MainLoop { # Local Variables my $user_agent; # User agent to fetch pages (LWP::UserAgent) my $server_daemon; # Server daemon object (HTTP::Daemon) my $client; # Client connection (HTTP::Daemon::ClientConn) # Print out status information. print "Starting up server...\n"; # When a child exits, it sends the CHLD signal to the parent process. # But we need to properly acknowledge this in order for the child to be # able to exit properly. UNIX is just weird about this. So we install # our signal handler and this theoretically prevents zombie child # processes, which is a Good Thing, considering the resource consumption # of Perl and the limited process table. We could use an anonymous # subroutine, but if this is defensive against SysV UNIXes. $SIG{CHLD} = \&Reaper; # Set up a user agent for the child processes to use to fetch the pages # from the servers. This way it will be ready for the child processes # to use when they need it. $user_agent=new LWP::UserAgent; $user_agent->agent("Ravel/0.09"); # If the user wants this to chain through another proxy, they can load # the environment variables, which this will retrieve and use to set # itself to go through the given proxies. $user_agent->env_proxy if $use_proxy; # Create a daemon object to list for incoming client request. We'll set # it to listen on the given port. $server_daemon=new HTTP::Daemon LocalAddr=>$server_addr, LocalPort=>$server_port; # Tell the user that we are successfully running. print "Server active at ".$server_daemon->url."\n"; # Here is the main loop. We block on accept and wait for client # connections. For each one, we call ProcessRequest to handle it. while($client=$server_daemon->accept) { ProcessClient($client,$user_agent); } } # ============================================================================ # Main Sequence DisplayBanner; # Display our banner to the user. GetConfig; # Get the configuration information. InitDaemonMode if $daemon_mode; # Go into the background daemon style if desired. LoadPlugins; # Load up all of the plugins MainLoop; # Execute the server loop.