#!/usr/bin/ruby
#
# maven-proxy.rb -- a caching-proxy CGI script for lazily populating
# local Maven repositories.
#
# Program: maven-proxy.rb
# Version: 1.0.3
# License: MIT
#
# Copyright (c) 2005, 2006 Nick Sieger
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# FEATURES
#
# - Lazily caches resources from upstream repositories.
# - Does not use local cache for SNAPSHOT resources unless resource is
#   not found upstream.
# - Overrides local cache and refetches from upstream if request URI
#   contains the query string '?refetch'.
# - Proxies but does not cache index (directory listing) content.
#   This allows for seamless navigation of multiple upstream
#   directories under a single hierarchy of the local repository.
# - For directory listings, reports the upstream location from where
#   it came.
# - Propagates remote redirects (e.g., /res to /res/) to keep child
#   links in directory listings correct.
# - For use with Ruby 1.8.x (built and tested on Ruby 1.8.2
#   i386-mswin32)
#
# OPERATION
#
# - Serves files from local repository, if they exist, unless:
#   a. The URI contains the string 'SNAPSHOT' -- snapshots should
#      always be resolved to the source
#   b. The query string on the URI is the string 'refetch'
# - Checks remote repositories in the order listed for the requested
#   URI. If a remote repository responds, the request is downloaded
#   and cached in the local repository, except if the request appears
#   to be for an index page, in which case the content is proxied but
#   not cached.
# - Responds with 404 if request is not cached and no upstream
#   repositories have the resource.
#
# CONFIGURATION
#
# 1. Check (and set if necessary) the path to your Ruby 1.8 executable
#    in the #! line above.
# 2. Set configuration variables below according to your installation.
#    LOCAL -- full path to local repository location
#    REMOTE -- space-separated list of remote repository urls
#      (inside the %w{})
#    ALIAS -- alias/leading URI to repository in the local webserver
#      (optional; comment out to just use the CGI script path)
#    MIME_TYPES -- path to Apache-style mime.types file for additional
#      types (optional)
#    LOG_FILE -- path to log file (optional); comment out to disable logging
#
# APACHE
#
# If you're using this CGI inside Apache, a recommended way to use the
# proxy is to let Apache serve up cached files and directories and
# only defer to the CGI for SNAPSHOTs, missing and 'refetch'-ed files.
# You can do this with rewrite rules as follows.
#
# NOTE: the <Directory> containers and the Alias target below are
# placeholders -- adjust the paths to match your installation (the
# second path should be the LOCAL repository directory).
#
#   ScriptAlias /cgi/ "/path/to/your/cgi/"
#   <Directory "/path/to/your/cgi/">
#     Options ExecCGI
#     AddHandler cgi-script .rb
#     Allow from all
#     Order allow,deny
#   </Directory>
#
#   Alias /repos/ "/path/to/local/repository/"
#   <Directory "/path/to/local/repository/">
#     Options Indexes MultiViews FollowSymLinks
#     AllowOverride None
#     Allow from all
#     Order allow,deny
#     RewriteEngine on
#     RewriteBase /repos/
#     # Always defer to cgi when URI is a file containing 'SNAPSHOT'
#     RewriteCond %{REQUEST_FILENAME} !-d
#     RewriteCond %{REQUEST_URI} SNAPSHOT
#     RewriteRule ^(.*)$ /cgi/maven-proxy.rb/$1 [QSA,L]
#     # Always defer to cgi when query string == 'refetch'
#     RewriteCond %{QUERY_STRING} ^refetch$
#     RewriteRule ^(.*)$ /cgi/maven-proxy.rb/$1 [QSA,L]
#     # Defer to cgi for missing files and directories
#     RewriteCond %{REQUEST_FILENAME} !-d
#     RewriteCond %{REQUEST_FILENAME} !-f
#     RewriteRule ^(.*)$ /cgi/maven-proxy.rb/$1 [QSA,L]
#   </Directory>
#
# VERSION HISTORY
#
# Version: 1.0.3 [02/23/06 10:02 NJS]
# - Fix local resolution of SNAPSHOT resources that don't exist in
#   the remote repositories
# - Additional rewrite rule configuration documented
#
# Version: 1.0.2 [11/28/05 09:28 NJS]
# - Add doco covering RewriteRule for SNAPSHOTs
#
# Version: 1.0.1 [11/27/05 01:30 NJS]
# - Refactor tests into multiple methods
# - Introduce strange commons-httpclient test that fails on some
#   servers for me but not my local setup
#
# Version: 1.0 [11/26/05 16:46 NJS]
# - Initial release

# ==== Configuration section ====

LOCAL = 'c:/projects/ruby/maven-repos'
REMOTE = %w{http://www.ibiblio.org/maven2}
ALIAS = '/repos'
MIME_TYPES = 'c:/tools/apache/Apache/conf/mime.types'
# LOG_FILE = 'c:/tools/apache/Apache/logs/maven-proxy.log'

# Set umask for the CGI process (if desired)
File.umask(0002)

# ==== Program below here -- no configuration needed ====

require 'cgi'
require 'webrick/httputils'
require 'net/http'
require 'uri'
require 'fileutils'
require 'logger'

# Log to LOG_FILE when it is configured; otherwise log into a throwaway
# in-memory buffer so LOGGER can be used unconditionally.
if defined?(LOG_FILE) && !LOG_FILE.nil? && LOG_FILE.length > 0
  LOGGER = Logger.new(LOG_FILE)
else
  require 'stringio'
  LOGGER = Logger.new(StringIO.new)
end

# Maps file names to MIME types. Starts from WEBrick's default table,
# adds the entries of the optional Apache-style MIME_TYPES file, and
# finally the Maven-specific extensions (md5, sha1, pom).
class MimeLookup
  include WEBrick::HTTPUtils

  def initialize()
    @mime_types = {}
    @mime_types.update(WEBrick::HTTPUtils::DefaultMimeTypes)
    @mime_types.update(load_mime_types(MIME_TYPES)) if defined?(MIME_TYPES)
    @mime_types.update({"md5" => "text/plain", "sha1" => "text/plain", "pom" => "text/xml"})
  end

  # Returns the MIME type for +filename+ using the merged table.
  def mime_type(filename)
    super(filename, @mime_types)
  end
end

MimeTypes = MimeLookup.new

# Behaviour shared by every repository: mapping request paths to files
# below LOCAL, building client-facing paths and streaming local files.
module Repository
  def initialize
    # Non-destructive sub: the LOCAL constant must not be modified.
    @local = LOCAL.sub(%r{/$}, '')
  end

  # Builds the client-facing path for +path_info+ (defaults to the path
  # of the current request): ALIAS (or the CGI script name when no alias
  # is configured), then the path, then the request's query string, if any.
  def make_path(cgi, path_info = nil)
    path_info = cgi.path_info if path_info.nil?
    prefix = (defined?(ALIAS) && ALIAS) ? "#{ALIAS}" : "#{cgi.script_name}"
    path = "#{prefix}#{path_info}"
    path += "?#{cgi.query_string}" if cgi.query_string && cgi.query_string.length > 0
    path
  end

  # Returns the location below the local repository that corresponds to
  # the request +path+ (a nil path is treated as empty).
  #
  # Raises ArgumentError when the path contains a '..' segment: the path
  # comes straight from the request and must never address a file
  # outside the repository, neither for reading nor for caching.
  def filename(path)
    relative = path.to_s.sub(%r{^/}, '')
    if relative.split(%r{[/\\]}).include?('..')
      raise ArgumentError, "Invalid path"
    end
    File.join(@local, relative)
  end

  # Writes the local file for the current request to the client with
  # status 200, its MIME type and its size. Raises RuntimeError when the
  # file is missing or unreadable.
  def stream(cgi)
    fn = filename(cgi.path_info)
    unless File.exist?(fn) && File.readable?(fn)
      raise RuntimeError, "Unable to read file"
    end
    File.open(fn, 'rb') do |f|
      cgi.out("status" => "200", "type" => MimeTypes.mime_type(fn), "length" => File.size(fn)) { f.read }
    end
  end
end

# Serves whatever is present in the local repository, snapshots
# included. Main consults it last, as the fallback for resources that
# no remote repository could supply.
class LocalSnapshotRepository
  include Repository

  # This repository never insists on going upstream.
  def refetch?(cgi)
    false
  end

  # True when the requested path is an existing regular (non-directory)
  # file and no refetch is required.
  def resolve?(cgi)
    fn = filename(cgi.path_info)
    exist = !refetch?(cgi) && File.exist?(fn) && !File.directory?(fn)
    LOGGER.info "found at #{fn}" if exist
    exist
  end

  def to_s
    @local
  end
end

class LocalCacheRepository < LocalSnapshotRepository
  # Refetch file from remote repository either when it is a snapshot
  # or when there is a '?refetch' on the end of the URL.
  def refetch?(cgi)
    cgi.path_info =~ /SNAPSHOT/ || ((cgi.query_string || "") == "refetch")
  end
end

# An upstream repository reached over HTTP. Regular files are
# downloaded into the local repository, index pages are proxied without
# being cached, and same-host redirects are passed on to the client.
class RemoteRepository
  include Repository

  # +remote+ is the base URL of the upstream repository.
  def initialize(remote)
    super()
    @remote = URI.parse(remote.sub(%r{/$}, ''))
    @contents = nil   # body of an index page to proxy, set by resolve?
    @redirect = nil   # CGI headers of a redirect to emit, set by resolve?
  end

  # Requests the current path from the upstream repository and returns
  # true when this repository can answer the request; the answer itself
  # is written by #stream. Nothing is sent to the client here, so an
  # unresolved request leaves the output untouched for the next
  # repository (or the final 404).
  def resolve?(cgi)
    path = cgi.path_info
    response = fetch(path)
    case response
    when Net::HTTPSuccess
      LOGGER.info "found at #{@remote}, downloading"
      if index_page?(response, path)
        @contents = response.body
      else
        store(path, response.body)
      end
      true
    when Net::HTTPRedirection
      remember_redirect(cgi, response)
    else
      LOGGER.info "not found, received #{response.inspect}"
      false
    end
  end

  # Emits what #resolve? prepared: the redirect, or the proxied index
  # page prefixed with a notice naming its upstream origin, or (via the
  # Repository mixin) the file that was just cached.
  def stream(cgi)
    if @redirect
      cgi.out(@redirect) { "Redirect" }
    elsif @contents
      # The notice is sent as text/html (the CGI default), so escape the
      # request-derived location.
      origin = CGI.escapeHTML("#{@remote}#{cgi.path_info}")
      cgi.out("status" => "200") do
        "These contents are from the upstream location #{origin}.\n\n#{@contents}"
      end
    else
      super
    end
  end

  def to_s
    @remote.to_s
  end

  private

  # Performs the upstream GET. Returns the response, or nil when the
  # upstream could not be reached, so that an unavailable remote does not
  # prevent the remaining repositories from being consulted.
  def fetch(path)
    Net::HTTP.start(@remote.host, @remote.port) do |http|
      http.get("#{@remote.path}#{path}")
    end
  rescue StandardError, Timeout::Error => e
    LOGGER.warn "request to #{@remote} failed: #{e.class}: #{e.message}"
    nil
  end

  # Don't download and cache HTML file for URIs that correspond to
  # index.html: HTML served for a path that is not itself an .html file,
  # or for an explicit index.html, is taken to be a directory listing.
  def index_page?(response, path)
    response['Content-Type'] =~ %r{text/html} &&
      (path.to_s !~ /\.html$/ || path.to_s =~ %r{/index\.html$})
  end

  # Caches +body+ in the local repository under +path+.
  def store(path, body)
    fn = filename(path)
    FileUtils.mkdir_p(File.dirname(fn))
    File.open(fn, "wb") do |dest|
      dest << body
    end
  end

  # Records a redirect that stays on the upstream host, rewritten to the
  # proxy's own URL space. Returns true when a redirect was recorded and
  # false when the response has no Location header or points elsewhere.
  def remember_redirect(cgi, response)
    location = response['location']
    LOGGER.info "redirect: #{location}"
    return false if location.nil?
    target = URI.parse(location)
    return false unless target.host == @remote.host && target.port == @remote.port
    newpath = target.path.sub(/^#{Regexp.escape(@remote.path)}/, '')
    @redirect = {
      "status" => "#{response.code} #{response.message}",
      "Location" => make_path(cgi, newpath)
    }
    true
  end
end

# CGI entry point: resolves the request against the repositories in
# order and writes exactly one response.
class Main
  def initialize
    @cgi = CGI.new
  end

  # Raises ArgumentError when the LOCAL repository path does not exist.
  def validate_config
    raise ArgumentError, "Invalid local repository path" unless File.exist? LOCAL
  end

  # Consults the local cache, then each remote repository, then the
  # local snapshot fallback. The first repository that resolves the
  # request streams the response. Returns true when one did.
  def fetch?
    LOGGER.info "fetching #{@cgi.path_info}"
    repositories = [LocalCacheRepository.new,
                    REMOTE.map { |r| RemoteRepository.new(r) },
                    LocalSnapshotRepository.new].flatten
    repositories.each do |repos|
      LOGGER.info "checking repos: #{repos.to_s}"
      next unless repos.resolve?(@cgi)
      repos.stream @cgi
      return true
    end
    false
  end

  # Handles the request: 404 when no repository has the resource, 500
  # when processing fails. Both pages are sent as text/html (the CGI
  # default), so request- and error-derived text is HTML-escaped.
  def main
    begin
      validate_config
      unless fetch?
        requested = CGI.escapeHTML(@cgi.path_info.to_s)
        @cgi.out("status" => "404 Not Found") do
          "\n\nFile not found\n\n#{requested} was not found upstream in any of:\n\n#{REMOTE.join("\n")}"
        end
      end
    rescue StandardError => e
      LOGGER.error("#{e.class}: #{e.message}\n#{e.backtrace.join("\n")}")
      @cgi.out("status" => "500 Internal Server Error") do
        "\n\nError executing script\n\n#{CGI.escapeHTML(e.to_s)}\n\n"
      end
    end
  end
end

# Only launch CGI in online mode, otherwise run tests
if ENV['REQUEST_METHOD']
  Main.new.main
else
  require 'test/unit'
  require 'fileutils'

  # Hostname and port where proxy script is running
  HOSTNAME = 'localhost'
  PORT = 80

  # Helpers for the integration tests below: HTTP access to the running
  # proxy, inspection of the local repository and custom assertions.
  module ProxyTestHelper
    def initialize(*args)
      super
      @local = LOCAL
      @hostname = HOSTNAME
      @port = PORT
      @script_name = ALIAS
    end

    # Removes +dir+ (relative to the local repository) recursively.
    def rm_rf(dir)
      FileUtils.rm_rf File.join(@local, dir)
    end

    # Modification time of the cached file at +path+.
    def current_time(path)
      File.stat(File.join(@local, path)).mtime
    end

    # Requests +path+ through the proxy and returns the HTTP response.
    def get(path)
      Net::HTTP.start(@hostname, @port) { |http| http.get("#{@script_name}/#{path}") }
    end

    # Matcher: the text contains the literal +expr+.
    def contains(expr)
      proc { |x| x =~ /#{Regexp.escape expr}/ }
    end

    # Matcher: the text does not contain the literal +expr+.
    def does_not_contain(expr)
      proc { |x| x !~ /#{Regexp.escape expr}/ }
    end

    # Asserts that +resp+ is a redirect whose Location satisfies +place+
    # (a matcher proc or a pattern).
    def assert_redirect_to(place, resp)
      assert_kind_of Net::HTTPRedirection, resp, "Not redirected"
      to_match = resp['location']
      case place
      when Proc
        assert place.call(to_match), "Location '#{to_match}' doesn't match"
      else
        assert_match place.to_s, to_match, "Location '#{to_match}' doesn't match"
      end
    end

    # Asserts that +resp+ succeeded and its body satisfies +expr+
    # (a matcher proc or a pattern).
    def assert_body(expr, resp)
      assert_kind_of Net::HTTPSuccess, resp, "Request failed"
      to_match = resp.body
      case expr
      when Proc
        assert expr.call(to_match), "Body doesn't match"
      else
        assert_match expr.to_s, to_match, "Body doesn't match"
      end
    end

    # Asserts that +resp+ succeeded and carries every header in +hash+.
    def assert_header(hash, resp)
      assert_kind_of Net::HTTPSuccess, resp, "Request failed"
      hash.each do |key, val|
        assert_equal val, resp[key]
      end
    end

    # Asserts that +path+ exists in the local repository.
    def assert_cached(path)
      assert File.exist?(File.join(@local, path)), "File '#{path}' not cached locally"
    end
  end

  # Integration tests; they need the proxy running at HOSTNAME:PORT and
  # a reachable upstream repository.
  class MavenProxyTest < Test::Unit::TestCase
    include ProxyTestHelper

    def setup
      rm_rf('org/apache/maven')
    end

    def test_redirects_from_remote_server_are_propagated
      assert_redirect_to contains('org/apache/maven/'), get('org/apache/maven')
      assert_body contains('contents are from the upstream location'), get('org/apache/maven/')
      # refetch -- should be from remote
      assert_redirect_to contains('org/apache/maven/?refetch'), get('org/apache/maven?refetch')
      assert_body contains('contents are from the upstream location'), get('org/apache/maven/?refetch')
    end

    def test_check_for_proper_content_type_headers
      assert_header({'content-type' => 'text/xml'}, get('org/apache/maven/maven/maven-metadata.xml'))
      assert_cached 'org/apache/maven/maven/maven-metadata.xml'
      assert_header({'content-type' => 'text/plain'}, get('org/apache/maven/maven/maven-metadata.xml.md5'))
      assert_cached 'org/apache/maven/maven/maven-metadata.xml.md5'
      assert_header({'content-type' => 'text/plain'}, get('org/apache/maven/maven/maven-metadata.xml.sha1'))
      assert_cached 'org/apache/maven/maven/maven-metadata.xml.sha1'
      assert_header({'content-type' => 'application/octet-stream'},
                    get('org/apache/maven/maven-plugin-api/2.0/maven-plugin-api-2.0.jar'))
      assert_cached 'org/apache/maven/maven-plugin-api/2.0/maven-plugin-api-2.0.jar'
    end

    def test_refetch_pom
      # check that this is really a POM
      assert_body contains('org.apache.maven'), get('org/apache/maven/maven/2.0/maven-2.0.pom')
      assert_cached 'org/apache/maven/maven/2.0/maven-2.0.pom'
      # Now refetch and ensure that it was redownloaded
      time = current_time('org/apache/maven/maven/2.0/maven-2.0.pom')
      sleep 1
      assert_body contains('org.apache.maven'), get('org/apache/maven/maven/2.0/maven-2.0.pom?refetch')
      assert current_time('org/apache/maven/maven/2.0/maven-2.0.pom') > time
    end

    def test_not_found_is_propagated
      # Ensure 404 is propagated
      assert_kind_of Net::HTTPClientError, get('org/apache/maven/maven/2.0/zzz-2.0.pom')
    end

    # This test assumes an Apache configuration similar to the one
    # mentioned above where Apache is used to serve the cached files
    # (and thus directory listings) -- the assertions that test that
    # local directory listings don't have the "from the upstream
    # location" notice in them will fail in that case.
    def test_local_directory_index
      assert_header({'content-type' => 'application/octet-stream'},
                    get('org/apache/maven/maven-plugin-api/2.0/maven-plugin-api-2.0.jar'))
      assert_cached 'org/apache/maven/maven-plugin-api/2.0/maven-plugin-api-2.0.jar'
      # directory should be cached locally now
      assert_cached 'org/apache/maven'
      assert_body does_not_contain('contents are from the upstream location'), get('org/apache/maven/')
    end

    def test_commons_httpclient_confusing_pom_and_md5_files
      rm_rf 'commons-httpclient/commons-httpclient/3.0-rc3/commons-httpclient-3.0-rc3.pom'
      assert_header({'content-type' => 'text/plain'},
                    get('commons-httpclient/commons-httpclient/3.0-rc3/commons-httpclient-3.0-rc3.pom.md5'))
      assert_cached 'commons-httpclient/commons-httpclient/3.0-rc3/commons-httpclient-3.0-rc3.pom.md5'
      assert_body contains('commons-httpclient'),
                  get('commons-httpclient/commons-httpclient/3.0-rc3/commons-httpclient-3.0-rc3.pom')
      assert_cached 'commons-httpclient/commons-httpclient/3.0-rc3/commons-httpclient-3.0-rc3.pom'
    end
  end
end