#!/usr/bin/perl

use warnings;
use strict;

use Getopt::Std qw(getopts);
use HTML::Entities qw(decode_entities);
use LWP::Simple ();
use POSIX qw(strftime);
use URI;

our $VERSION = "0.000";

my $top_uri = "http://search.cpan.org/modlist/";

sub get($) {
	my($uri) = @_;
	my $content = LWP::Simple::get($uri);
	die "failed to download $uri\n" unless defined $content;
	$content;
}

my $top_page = get($top_uri);

my %opts;
unless(getopts("p", \%opts)) {
	print STDERR "$0: usage: $0 [-p]\n";
	exit 2;
}

my $start_time = time;

my @categories = ();
while($top_page =~ m{<a href="(/modlist/\w+)">([^<]+)</a>}g) {
	push @categories, {
		reluri => $1,
		name => $2,
	};
}
@categories = sort { $a->{name} cmp $b->{name} } @categories;

sub parse_row($) {
	my($row) = @_;
	my @cells = ();
	while($row =~ m{>\s*([^\s<][^<]*)<}g) {
		my $content = decode_entities($1);
		$content =~ tr/\xa0/ /;
		push @cells, $content;
	}
	unless(@cells == 4) {
		warn "bad table row >>>>$row<<<<";
		return undef;
	}
	my $module = {
		name => $cells[0],
		dslip => $cells[1],
		description => $cells[2],
		info => $cells[3],
	};
	unless($module->{name} =~ /^((?:.*::)?)(\w+)$/) {
		warn "bad module name `$module->{name}'";
		return undef;
	}
	$module->{namespace} = $1;
	$module->{leafname} = $2;
	$module;
}

my $chapter = "a";
foreach my $category (@categories) {
	$category->{chapter} = $chapter++;
	$category->{modules} = {};
	my @uris_to_do = $category->{reluri};
	my %uris_done = ();
	while(@uris_to_do) {
		my $reluri = pop @uris_to_do;
		print STDERR "examining $reluri...\n" if $opts{p};
		$uris_done{$reluri} = undef;
		my $page = get(URI->new_abs($reluri, $top_uri));
		while($page =~ m{<a href="(/modlist/[\w/]+)">([^<]+)</a>}g) {
			push @uris_to_do, $1 unless exists $uris_done{$1};
		}
		while($page =~ m{<tr class=[rs]>(.*?)</tr>}sg) {
			my $module = parse_row($1);
			next unless defined $module;
			$category->{modules}->{$module->{name}} = $module;
		}
	}
}

print "=" x 71, "\n\n",
	"Perl 5 module list\n\n",
	"Scraped from <", $top_uri, ">,\nstarting ",
	strftime("%Y-%m-%dT%H:%M:%SZ", gmtime($start_time)), ".\n\n",
	"scrapemodlist version ", $VERSION, "\n\n",
	"_" x 71, "\n\n",
	"CONTENTS\n\n";
foreach my $category (@categories) {
	printf "%-3s  %s\n", $category->{chapter}.")", $category->{name};
}

foreach my $category (@categories) {
	print "\n", "_" x 71, "\n\n",
		$category->{chapter}, ") ", $category->{name}, "\n\n",
		"Name               DSLIP Description",
			"                                  Info\n",
		"------------       ----- -----------",
			"--------------------------------- ----";
	my $ns = "0";
	foreach my $module (sort { $a->{namespace} cmp $b->{namespace} ||
				$a->{leafname} cmp $b->{leafname} }
					values %{$category->{modules}}) {
		if($module->{namespace} ne $ns) {
			$ns = $module->{namespace};
			print "\n";
			print $ns, "\n" if $ns;
		}
		printf "%-18s %-5s %-44s %s\n",
			($ns ? "::" : "").$module->{leafname},
			$module->{dslip}, $module->{description},
			$module->{info};
	}
}

print "\n", "=" x 71, "\n";

exit 0;

=head1 NAME

scrapemodlist - generate an up-to-date Perl 5 module list by scraping
search.cpan.org

=head1 SYNOPSIS

scrapemodlist [-p]

=head1 DESCRIPTION

The Perl module list is no longer maintained as the single text file it
used to be.  The data is now maintained in a database, searchable through
L<http://search.cpan.org>, but that's not as convenient to browse as the
old text file.  This program pulls all the data from search.cpan.org
and presents it in a close approximation of the traditional format.
The result goes to standard output.

=head1 OPTIONS

=over

=item -p

Indicate progress on standard error.  This is useful in order to localise
a problem with the data downloaded from the website.

=back

=head1 BUGS

Makes about a hundred separate page requests of the server, so it's best
to save the output to avoid doing this repeatedly.

We really ought to cache the pages, but search.cpan.org doesn't give
useful timestamps for these pages.

Not a perfect match for the hand-maintained module list format.
The ordering is perfectly regular, which the old list wasn't in some
cases where one module name was a prefix of another.  There are no
subcategory headings.  The chapters are in alphabetical order, because
search.cpan.org doesn't give the numbers (this program gives the chapters
letters, to avoid confusion with the numbers).  The chapter titles are
abbreviated, because that's what we get from search.cpan.org.

=head1 AUTHOR

Andrew Main (Zefram) <zefram@fysh.org>

=head1 COPYRIGHT

Copyright (C) 2004 Andrew Main (Zefram) <zefram@fysh.org>

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.