#!/usr/bin/perl use warnings; use strict; use Getopt::Std qw(getopts); use HTML::Entities qw(decode_entities); use LWP::Simple (); use POSIX qw(strftime); use URI; our $VERSION = "0.000"; my $top_uri = "http://search.cpan.org/modlist/"; sub get($) { my($uri) = @_; my $content = LWP::Simple::get($uri); die "failed to download $uri\n" unless defined $content; $content; } my $top_page = get($top_uri); my %opts; unless(getopts("p", \%opts)) { print STDERR "$0: usage: $0 [-p]\n"; exit 2; } my $start_time = time; my @categories = (); while($top_page =~ m{([^<]+)}g) { push @categories, { reluri => $1, name => $2, }; } @categories = sort { $a->{name} cmp $b->{name} } @categories; sub parse_row($) { my($row) = @_; my @cells = (); while($row =~ m{>\s*([^\s<][^<]*)<}g) { my $content = decode_entities($1); $content =~ tr/\xa0/ /; push @cells, $content; } unless(@cells == 4) { warn "bad table row >>>>$row<<<<"; return undef; } my $module = { name => $cells[0], dslip => $cells[1], description => $cells[2], info => $cells[3], }; unless($module->{name} =~ /^((?:.*::)?)(\w+)$/) { warn "bad module name `$module->{name}'"; return undef; } $module->{namespace} = $1; $module->{leafname} = $2; $module; } my $chapter = "a"; foreach my $category (@categories) { $category->{chapter} = $chapter++; $category->{modules} = {}; my @uris_to_do = $category->{reluri}; my %uris_done = (); while(@uris_to_do) { my $reluri = pop @uris_to_do; print STDERR "examining $reluri...\n" if $opts{p}; $uris_done{$reluri} = undef; my $page = get(URI->new_abs($reluri, $top_uri)); while($page =~ m{([^<]+)}g) { push @uris_to_do, $1 unless exists $uris_done{$1}; } while($page =~ m{(.*?)}sg) { my $module = parse_row($1); next unless defined $module; $category->{modules}->{$module->{name}} = $module; } } } print "=" x 71, "\n\n", "Perl 5 module list\n\n", "Scraped from <", $top_uri, ">,\nstarting ", strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($start_time)), ".\n\n", "scrapemodlist version ", $VERSION, "\n\n", "_" x 71, "\n\n", "CONTENTS\n\n"; foreach my $category (@categories) { printf "%-3s %s\n", $category->{chapter}.")", $category->{name}; } foreach my $category (@categories) { print "\n", "_" x 71, "\n\n", $category->{chapter}, ") ", $category->{name}, "\n\n", "Name DSLIP Description", " Info\n", "------------ ----- -----------", "--------------------------------- ----"; my $ns = "0"; foreach my $module (sort { $a->{namespace} cmp $b->{namespace} || $a->{leafname} cmp $b->{leafname} } values %{$category->{modules}}) { if($module->{namespace} ne $ns) { $ns = $module->{namespace}; print "\n"; print $ns, "\n" if $ns; } printf "%-18s %-5s %-44s %s\n", ($ns ? "::" : "").$module->{leafname}, $module->{dslip}, $module->{description}, $module->{info}; } } print "\n", "=" x 71, "\n"; exit 0; =head1 NAME scrapemodlist - generate an up-to-date Perl 5 module list by scraping search.cpan.org =head1 SYNOPSIS scrapemodlist [-p] =head1 DESCRIPTION The Perl module list is no longer maintained as the single text file it used to be. The data is now maintained in a database, searchable through L, but that's not as convenient to browse as the old text file. This program pulls all the data from search.cpan.org and presents it in a close approximation of the traditional format. The result goes to standard output. =head1 OPTIONS =over =item -p Indicate progress on standard error. This is useful in order to localise a problem with the data downloaded from the website. =back =head1 BUGS Makes about a hundred separate page requests of the server, so it's best to save the output to avoid doing this repeatedly. We really ought to cache the pages, but search.cpan.org doesn't give useful timestamps for these pages. Not a perfect match for the hand-maintained module list format. The ordering is perfectly regular, which the old list wasn't in some cases where one module name was a prefix of another. There are no subcategory headings. The chapters are in alphabetical order, because search.cpan.org doesn't give the numbers (this program gives the chapters letters, to avoid confusion with the numbers). The chapter titles are abbreviated, because that's what we get from search.cpan.org. =head1 AUTHOR Andrew Main (Zefram) =head1 COPYRIGHT Copyright (C) 2004 Andrew Main (Zefram) This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.