Article 1597 of comp.infosystems.www: Xref: feenix.metronet.com comp.infosystems.www:1597 Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!math.ohio-state.edu!cs.utexas.edu!geraldo.cc.utexas.edu!ansel.cc.utexas.edu!not-for-mail From: zippy@ansel.cc.utexas.edu (Jack Lund) Newsgroups: comp.infosystems.www Subject: Re: How to get gopher files with perl? Date: 14 Sep 1993 13:42:50 -0500 Organization: The University of Texas - Austin Lines: 711 Message-ID: <2753ba$a2m@ansel.cc.utexas.edu> References: <1993Sep14.163827.7182@news.unige.ch> NNTP-Posting-Host: ansel.cc.utexas.edu In article <1993Sep14.163827.7182@news.unige.ch>, Oscar Nierstrasz wrote: > >I want to get a large number of gopher files using a shell/perl script >(actually I don't want to -- Simon Gibbs next door wants to). > >The gopher protocol looked simple, so I thought it would be easy. >It seems to work, but the files returned are short if they are image files (GIF). >What am I doing wrong? Are there any good pointers to on-line doc? >(I have seen the gopher protocol.rtf at cern, but it seems to be >somewhat out of date ...) > >I thought all you had to do was connect, send the name of the file you >want, and gobble up the reply. That doesn't seem to work. >Are image files compressed or otherwise encoded? > >Here is the perl subroutine I am using: I modified a perl script of yours (remember hget?) to take (just about) any URL and grab the appropriate document. It works well for binary & ascii files. You might check it out. I haven't had a chance to take a close look at your code, but if I discover anything, I'll let you know. -------------------------url_get------------------------------------ #!/bin/perl # # url_get --- get a document given a WWW URL # # Jack Lund 9/3/93 zippy@ccwf.cc.utexas.edu # # from hget by: # Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch # $home = $ENV{"HOME"}; require "chat2.pl"; require "$home/lib/perl/URL.pl"; require "$home/lib/perl/ftplib.pl"; require "getopts.pl"; &Getopts(':b'); die "Usage: url_get ...\n" unless $#ARGV >= 0; $timeout = undef; foreach $url (@ARGV) { ($protocol, $host, $port, $rest1, $rest2, $rest3) = &url'parse_url($url); foo: { if ($protocol eq "http") { &http_get($host,$port,$rest1); last foo; } if ($protocol eq "gopher") { &gopher_get($host, $port, $rest1, $rest2, $rest3); last foo; } if ($protocol eq "file") { &file_get($host, $port, $rest1); last foo; } if ($protocol eq "news") { &news_get($host, $port, $rest1); last foo; } die "Protocol $protocol not supported!\n"; } } sub http_get { local($host,$port,$request) = @_; ($handle = &chat'open_port($host, $port)) || die "chat'open($host,$port): $!\n"; &chat'print($handle,"GET $request\n") || die "chat'print(GET $request): $!\n"; *S = *chat'S; while () { print "$_"; } &chat'close($handle); } sub gopher_get { local($host,$port,$gtype,$selector,$search) = @_; $request = ($search ? "$selector\t$search" : $selector); ($handle = &chat'open_port($host, $port)) || die "chat'open($host,$port): $!\n"; &chat'print($handle,"$request\n") || die "chat'print($request): $!\n"; *S = *chat'S; # If this is a binary document, retreive it using sysreads rather # than if ($gtype eq '5' || $gtype eq '9' || $gtype eq 'I') { $done = 0; $rmask = ""; vec($rmask,fileno(S),1) = 1; do { ($nfound, $rmask) = select($rmask, undef, undef, $timeout); if ($nfound) { $nread = sysread(S, $thisbuf, 1024); if ($nread > 0) { syswrite(STDOUT, $thisbuf, $nread) || die "Syswrite: $!\n"; } else { $done++; } } else { warn "Timeout\n"; $done++; } } until $done; } # This is an ASCII document, and we can get it line-by-line using else { while () { last if (/^\.\r\n$/); chop; chop; print "$_\n"; } } &chat'close($handle); } sub file_get { local($host, $port, $path) = @_; $localhost = `hostname`; if ($host eq $localhost && !defined($port)) { open(IN, $path) || die "$path: $!\n"; while () { print "$_\n"; } close(IN); } else { &ftp'open($host) || die "Unable to open ftp connection to $host\n"; &ftp'get($path, "&STDOUT") || die "Unable to get file $path from $host\n"; &ftp'close; } } sub news_get { local($host, $port, $article) = @_; ($handle = &chat'open_port($host, $port)) || die "chat'open($host,$port): $!\n"; if ($article =~ /^[^<].+@.+[^>]$/) { $request = "article <$article>"; } elsif ($article =~ /^<.+@.+>$/) { $request = "article $article"; } elsif ($article =~ /^\*$/) { die "Only support URLs of the form: news:article\n"; } elsif ($article) { die "Only support URLs of the form: news:article\n"; } else { die "Bad url\n"; } # Read NNTP Connect message *S = *chat'S; $string = ; $string =~ /^(\d*) (.*)$/; die "NNTP Error: $2\n" unless ($1 eq '200'); # Send request &chat'print($handle,"$request\r\n") || die "chat'print($request): $!\n"; # Read reply message $string = ; $string =~ /^(\d*) (.*)$/; die "NNTP Error: $2\n" unless ($1 eq '220'); # Get article while () { last if (/^\.\r\n$/); chop; chop; print "$_\n"; } &chat'print($handle,"quit\n") || die "chat'print(quit): $!\n"; &chat'close($handle); } __END__ -------------------------URL.pl--------------------------------------- # # URL.pl - package to parse WWW URLs # # Jack Lund 9/3/93 zippy@ccwf.cc.utexas.edu # package url; # Default port numbers for URL services $ftp_port = 21; $http_port = 80; $gopher_port = 70; $telnet_port = 23; $wais_port = 210; $news_port = 119; # syntax: &url'parse_url(URL) # returns array containing following: # protocol protocol string from url. ex: "gopher", "http". # host host that specified protocol server is running on # port port that server answers on # the rest of the array is protocol-dependant. See code for details. # sub parse_url { local($url) = @_; if ($url =~ m#^(\w+):#) { $1 =~ s/[A-Z]/[a-z]/g; $protocol = $1; } else { return undef; } if ($protocol eq "file" || $protocol eq "ftp") { # URL of type: file://hostname[:port]/path if ($url =~ m#^\s*\w+://([^ \t/:]+):?(\d*)(/.*)$#) { $1 =~ s/[A-Z]/[a-z]/; $host = $1; $port = ($2 ne "" ? $2 : $ftp_port); $path = $3; return ($protocol, $host, $port, $path); } # URL of type: file:/path if ($url =~ m#^\s*\w+:(/.*)$#) { $host = `hostname`; # Current host $port = undef; return ($protocol, $host, $port, $1); } return undef; } if ($protocol eq "news") { # URL of type: news://host[:port]/article if ($url =~ m#^\s*\w+://([^ \t:/]):?(\d*)/(.*)$#) { $host = $1; $port = ($2 ne "" ? $2 : $news_port); $selector = $3; } # URL of type: news:article elsif ($url =~ m#^\s*\w+:(.*)$#) { $host = $ENV{"NNTPSERVER"}; unless ($host) { warn "Couldn't get NNTP server name\n"; return undef; } $port = $news_port; $selector = $1; } else { return undef; } return ($protocol, $host, $port, $selector); } # URL of type: http://host[:port]/path[?search-string] if ($protocol eq "http") { if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)(/[^ \t\?]+)\??(.)*$#) { $1 =~ s/[A-Z]/[a-z]/g; $server = $1; $port = ($2 ne "" ? $2 : $http_port); $path = $3; $search = $4; return ($protocol, $server, $port, $path, $search); } return undef; } # URL of type: telnet://user@host[:port] if ($protocol eq "telnet") { if ($url =~ m#^\s*\w+://([^@]+)@([^: \t]+):?(\d*)$#) { $user = $1; $2 =~ s/[A-Z]/[a-z]/g; $host = $2; $port = (defined($3) ? $3 : $telnet_port); return($protocol, $host, $port, $user); } # URL of type: telnet://host[:port] if ($url =~ m#^\s*\w+://([^: \t]+):?(\d*)$#) { $1 =~ s/[A-Z]/[a-z]/g; $host = $1; $port = (defined($2) ? $2 : $telnet_port); return($protocol, $host, $port); } return undef; } # URL of type: gopher://host[:port]/[gtype]selector-string[?search-string] if ($protocol eq "gopher") { if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)/(\w?)([^ \t\?]*)\??(.*)$#) { $1 =~ s/[A-Z]/[a-z]/g; $server = $1; $port = ($2 ne "" ? $2 : $gopher_port); $gtype = ($3 ne "" ? $3 : 1); $selector = $4; $search = $5; return ($protocol, $server, $port, $gtype, $selector, $search); } return undef; } # URL of type: wais://host[:port]/database?search-string if ($protocol eq "wais") { if ($url =~ m#^\s\w+://([\w\d\.]+):?(\d*)/([^\?]+)\??(.*)$#) { $1 =~ s/[A-Z]/[a-z]/g; $server = $1; $port = (defined($2) ? $2 : $wais_port); $database = $3; $search = $4; return ($protocol, $server, $port, $database, $search); } return undef; } } -------------------------ftplib.pl--------------------------------------- # # This is a set of ftp library routines using chat2.pl # # Return code information taken from RFC 959 # Written by Gene Spafford # Last update: 10 April 92, Version 0.9 # # # Most of these routines communicate over an open ftp channel # The channel is opened with the "ftp'open" call. # package ftp; require "chat2.pl"; require "syscall.ph"; ########################################################################### # # The following are the variables local to this package. # I declare them all up front so I can remember what I called 'em. :-) # ########################################################################### LOCAL_VARS: { $Control; $Data_handle; $Host; $Myhost = "\0" x 65; (syscall(&SYS_gethostname, $Myhost, 65) == 0) || die "Cannot 'gethostname' of local machine (in ftplib)\n"; $Myhost =~ s/\0*$//; $NeedsCleanup; $NeedsClose; $ftp_error; $ftp_matched; $ftp_trans_flag; @ftp_list; local(@tmp) = getservbyname("ftp", "tcp"); ($FTP = $tmp[2]) || die "Unable to get service number for 'ftp' (in ftplib)!\n"; @std_actions = ( 'TIMEOUT', q($ftp_error = "Connection timed out for $Host!\n"; undef), 'EOF', q($ftp_error = "Connection to $Host timed out unexpectedly!\n"; undef) ); @sigs = ('INT', 'HUP', 'TERM', 'QUIT'); # sigs we'll catch & terminate on } ########################################################################### # # The following are intended to be the user-callable routines. # Each of these does one of the ftp keyword functions. # ########################################################################### sub error { ## Public $ftp_error; } ####################################################### # cd up a directory level sub cdup { ## Public &do_ftp_cmd(200, "cdup"); } ####################################################### # close an open ftp connection sub close { ## Public return unless $NeedsClose; &do_ftp_cmd(221, "quit"); &chat'close($Control); undef $NeedsClose; &do_ftp_signals(0); } ####################################################### # change remote directory sub cwd { ## Public &do_ftp_cmd(250, "cwd", @_); } ####################################################### # delete a remote file sub delete { ## Public &do_ftp_cmd(250, "dele", @_); } ####################################################### # get a directory listing of remote directory ("ls -l") sub dir { ## Public &do_ftp_listing("list", @_); } ####################################################### # get a remote file to a local file # get(remote[, local]) sub get { ## Public local($remote, $local) = @_; ($local = $remote) unless $local; unless (open(DFILE, ">$local")) { $ftp_error = "Open of local file $local failed: $!"; return undef; } else { $NeedsCleanup = $local; } return undef unless &do_open_dport; # Open a data channel unless (&do_ftp_cmd(150, "retr $remote")) { $ftp_error .= "\nFile $remote not fetched from $Host\n"; close DFILE; unlink $local; undef $NeedsCleanup; return; } $ftp_trans_flag = 0; do { &chat'expect($Data_handle, 60, '.|\n', q{print DFILE ($chat'thisbuf) || ($ftp_trans_flag = 3); undef $chat'S}, 'EOF', '$ftp_trans_flag = 1', 'TIMEOUT', '$ftp_trans_flag = 2'); } until $ftp_trans_flag; close DFILE; &chat'close($Data_handle); # Close the data channel undef $NeedsCleanup; if ($ftp_trans_flag > 1) { unlink $local; $ftp_error = "Unexpected " . ($ftp_trans_flag == 2 ? "timeout" : ($ftp_trans_flag != 3 ? "failure" : "local write failure")) . " getting $remote\n"; } &do_ftp_cmd(226); } ####################################################### # Do a simple name list ("ls") sub list { ## Public &do_ftp_listing("nlst", @_); } ####################################################### # Make a remote directory sub mkdir { ## Public &do_ftp_cmd(257, "mkd", @_); } ####################################################### # Open an ftp connection to remote host sub open { ## Public if ($NeedsClose) { $ftp_error = "Connection still open to $Host!"; return undef; } $Host = shift(@_); local($User, $Password, $Acct) = @_; $User = "anonymous" unless $User; $Password = "-" . $main'ENV{'USER'} . "@$Myhost" unless $Password; $ftp_error = ''; unless($Control = &chat'open_port($Host, $FTP)) { $ftp_error = "Unable to connect to $Host ftp port: $!"; return undef; } unless(&chat'expect($Control, 60, "^220 .*\n", "1", "^\d\d\d .*\n", "undef")) { $ftp_error = "Error establishing control connection to $Host"; &chat'close($Control); return undef; } &do_ftp_signals($NeedsClose = 1); unless (&do_ftp_cmd(331, "user $User")) { $ftp_error .= "\nUser command failed establishing connection to $Host"; return undef; } unless (&do_ftp_cmd("(230|332|202)", "pass $Password")) { $ftp_error .= "\nPassword command failed establishing connection to $Host"; return undef; } return 1 unless $Acct; unless (&do_ftp_cmd("(230|202)", "pass $Password")) { $ftp_error .= "\nAcct command failed establishing connection to $Host"; return undef; } 1; } ####################################################### # Get name of current remote directory sub pwd { ## Public if (&do_ftp_cmd(257, "pwd")) { $ftp_matched =~ m/^257 (.+)\r?\n/; $1; } else { undef; } } ####################################################### # Rename a remote file sub rename { ## Public local($from, $to) = @_; &do_ftp_cmd(350, "rnfr $from") && &do_ftp_cmd(250, "rnto $to"); } ####################################################### # Set transfer type sub type { ## Public &do_ftp_cmd(200, "type", @_); } ########################################################################### # # The following are intended to be utility routines used only locally. # Users should not call these directly. # ########################################################################### sub do_ftp_cmd { ## Private local($okay, @commands, $val) = @_; $commands[0] && &chat'print($Control, join(" ", @commands), "\r\n"); &chat'expect($Control, 60, "^$okay .*\\n", '$ftp_matched = $&; 1', '^(\d)\d\d .*\\n', '($String = $&) =~ y/\r\n//d; $ftp_error = qq{Unexpected reply for ' . "@commands" . ': $String}; $1 > 3 ? undef : 1', @std_actions ); } ####################################################### sub do_ftp_listing { ## Private local(@lcmd) = @_; @ftp_list = (); $ftp_trans_flag = 0; return undef unless &do_open_dport; return undef unless &do_ftp_cmd(150, @lcmd); do { # Following is grotty, but chat2 makes us do it &chat'expect($Data_handle, 30, "(.*)\r?\n", 'push(@ftp_list, $1)', "EOF", '$ftp_trans_flag = 1'); } until $ftp_trans_flag; &chat'close($Data_handle); return undef unless &do_ftp_cmd(226); grep(y/\r\n//d, @ftp_list); @ftp_list; } ####################################################### sub do_open_dport { ## Private local(@foo, $port) = &chat'open_listen; ($port, $Data_handle) = splice(@foo, 4, 2); unless ($Data_handle) { $ftp_error = "Unable to open data port: $!"; return undef; } push(@foo, $port >> 8, $port & 0xff); local($myhost) = (join(',', @foo)); &do_ftp_cmd(200, "port $myhost"); } ####################################################### # # To cleanup after a problem # sub do_ftp_abort { die unless $NeedsClose; &chat'print($Control, "abor", "\r\n"); &chat'close($Data_handle); &chat'expect($Control, 10, '.', undef); &chat'close($Control); close DFILE; unlink($NeedsCleanup) if $NeedsCleanup; die; } ####################################################### # # To set signals to do the abort properly # sub do_ftp_signals { local($flag, $sig) = @_; local ($old, $new) = ('DEFAULT', "ftp'do_ftp_abort"); $flag || (($old, $new) = ($new, $old)); foreach $sig (@sigs) { ($SIG{$sig} == $old) && ($SIG{$sig} = $new); } } 1; -- Jack Lund Email: zippy@ccwf.cc.utexas.edu Graphics Services Phone: (512) 471-3241 UT Austin Computation Center WWW: Zippy!