news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news Tue Mar 9 09:38:59 CST 1993 Article: 1468 of comp.lang.perl Xref: feenix.metronet.com comp.lang.perl:1468 Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news From: jv@mh.nl (Johan Vromans) Newsgroups: comp.lang.perl #Subject: WordPerfect document scanner Message-ID: Date: 9 Mar 93 13:18:08 GMT Sender: news@pronto.mh.nl (USEnet News System) Followup-To: alt.sources.d Organization: Multihouse Automation, the Netherlands Lines: 288 X-Md4-Signature: d90d6f205746a8854b154ba6d0177648 Nntp-Posting-Host: largo:(jv) This program reads a WordPerfect document and outputs its contents in a way suitable to study and analyze it. It can be used as a driver to perform your own WP to whatever conversions. Disclaimer: this program is not supported, use at your own risk. Johan ---- Cut Here and feed the following to sh ---- #!/bin/sh # This is a shell archive (produced by shar 3.49) # To extract the files from this archive, save it to a file, remove # everything above the "!/bin/sh" line above, and type "sh file_name". # # made 03/01/1993 15:47 UTC by jv@largo # Source directory /a/pronto/mozart/users/jv # # existing files will NOT be overwritten unless -c is specified # # This shar contains: # length mode name # ------ ---------- ------------------------------------------ # 6374 -rw-r--r-- wpscan.pl # # ============= wpscan.pl ============== if test -f 'wpscan.pl' -a X"$1" != X"-c"; then echo 'x - skipping wpscan.pl (File already exists)' else echo 'x - extracting wpscan.pl (Text)' sed 's/^X//' << 'SHAR_EOF' > 'wpscan.pl' && X#!/usr/local/bin/perl X# wp2txt.pl -- convert WP document to MH-Doc format X# SCCS Status : %Z%@ %M% %I% X# Author : Johan Vromans X# Created On : Mon Jul 13 21:23:19 1992 X# Last Modified By: Johan Vromans X# Last Modified On: Mon Mar 1 16:43:01 1993 X# Update Count : 315 X# Status : Unknown, Use with caution! X X################ Common stuff ################ X X# $LIBDIR = $ENV{"MH_DOCLIB"} || "/usr/local/lib/mh_doc"; X# unshift (@INC, $LIBDIR); X# require "common.pl"; X$my_package = "PRONTO/MH-Doc V1.02"; X$my_version = "%I%"; X$my_name = "%M%"; X X################ Program parameters ################ X X$opt_start = $opt_end = 0; X$opt_outline = 0; X$opt_analyze = 1; X$opt_short = 0; X$opt_attr = 0; X$opt_header = 1; X$opt_verbose = 0; X&options if $ARGV[0] =~ /^-/; X X################ Presets ################ X X@ARGV = ("-") unless @ARGV; Xopen (WPD, @ARGV[0]) || die ("$ARGV[0]: $!\n"); X&wpd_check; X Xif ( $opt_analyze ) { X &analyze; X} Xelse { X &convert; X} X Xexit (0); X X################ Subroutines ################ X X# Extract a two-byte (little endian) word from the input. Xsub opc_word { X shift (@opc) | (shift (@opc) << 8); X} X X# Skip a number of bytes. Xsub opc_skip { X local ($dis) = @_; X splice (@opc, 0, $dis); X} X X# Verify validity of WP document. Xsub wpd_check { X X local ($hdr) = ''; X X sysread (WPD, $hdr, 16); X local ($id, $std, $filetype) = unpack ("a4Vv", $hdr); X X if ($id eq "\xffWPC" && X $filetype == 0x0A01 ) { X X # Seek to start of document, and slurp it. X $opt_start = $std unless $opt_start; X seek (WPD, $opt_start, 0); X local ($/) = undef; X $wpd = ; X $wpdptr = $opt_start; X X # Truncate if requested. X $end = $wpdptr + length ($wpd) - 1; X if ( $opt_end > $opt_start ) { X substr ($wpd, $opt_end-$wpdptr) = ''; X } X else { X $opt_end = $end; X } X X # Feedback X if ( $opt_debug || $opt_analyze ) { X printf STDERR ("Startdoc @ 0x%x (%d)%s\n", $std, $std, X ($opt_start != $std || $opt_end != $end) ? X sprintf (", range is 0x%x (%d) thru 0x%x (%d)", X $opt_start, $opt_start, X $opt_end-1, $opt_end-1) : ''); X } X } X else { X die ("Not a WP document\n"); X } X} X X# The main processors -- X Xsub analyze { X X local ($opc); # opcode under examination X local ($opc_type); # type (0 = string, 1 = bytes) X local ($opc_len); # total length op opcode sequence X local ($opc_char); # text, if $opc_type == 0 X local (@opc); # bytes, if $opc_type == 1 X X while ( &fetch >= 0 ) { ; } # &fetch will do all X} X Xsub options { X X require "newgetopt.pl"; X local ($opt_noheader) = 0; X X $opt_help = $opt_ident = 0; X X if ( ! &NGetOpt X ("start=s", "end=s", X "analyze", "short", "outline", X "attr", "header", "noheader", X "verbose", "ident", "quiet", "help", "debug") X || $opt_help ) { X print STDERR <$opt_output"); X } X X if ( $opt_analyze ) { X $opt_analyze = 2 unless $opt_short; X open (STDERR, ">&STDOUT"); X if ( -t STDERR ) { X select (STDERR); X $| = 1; X } X } X X $opt_header = !$opt_noheader; X print STDERR ("This is $my_package [$my_name $my_version]") if $opt_ident; X} X Xsub fetch { X X # Fetch the next opcode from the input. X # X # $opc_type will designate the type of opcode: X # 0 -> sequence of ASCII characters, delivered in $opc_char. X # 1 -> sequence of bytes, delevired in @opc. X # $opc_len will hold the number of bytes involved. X # X # Return value is $opc_type, or -1 if exhausted. X X return -1 if $wpd eq ''; X X if ( $wpd =~ /^[ -~]+/ ) { X $opc_char = $&; X $wpd = $'; X $opc_len = length ($opc_char); X $opc_type = 0; X printf STDERR ("%04x: %s\n", $wpdptr, $opc_char) X if $opt_debug || $opt_analyze; X $wpdptr += $opc_len; X return $opc_type; X } X X $opc = ord ($wpd); X $opc_type = 1; X $opc_len = -1; X if ( $opc <= 037 || $opc == 0x7f ) { $opc_len = 1; } X elsif ( $opc >= 0x80 && $opc <= 0xbf ) { $opc_len = 1; } X elsif ( $opc == 0xc0 ) { $opc_len = 4; } X elsif ( $opc == 0xc1 ) { $opc_len = 9; } X elsif ( $opc == 0xc2 ) { $opc_len = 11; } X elsif ( $opc == 0xc3 || $opc == 0xc4 ) { $opc_len = 3; } X elsif ( $opc == 0xc5 ) { $opc_len = 5; } X elsif ( $opc == 0xc6 ) { $opc_len = 6; } X elsif ( $opc == 0xc7 ) { $opc_len = 7; } X elsif ( $opc >= 0xc8 && $opc <= 0xcf ) { X printf STDERR ("%04x: [Reserved opcode %02x]\n", $wpdptr, $opc); X $opc_len = 1; X } X X if ( $opc_len > 0 ) { X @opc = unpack ("C*", substr ($wpd, 0, $opc_len)); X substr ($wpd, 0, $opc_len) = ''; X if ( $opt_debug || $opt_analyze ) { X printf STDERR ("%04x: [" . join(" ", ("%02x") x $opc_len) . "]\n", X $wpdptr, @opc); X } X die ("*** PHASE ERROR ***\n") unless $opc == $opc[$#opc]; X $wpdptr += $opc_len; X return $opc_type; X } X X $opc_len = unpack ("v", substr ($wpd, 2, 2)); X @opc = unpack ("C*", substr ($wpd, 0, $opc_len+4)); X substr ($wpd, 0, $opc_len+4) = ''; X if ( $opt_debug || $opt_analyze ) { X local (@opcb) = @opc; X local (@finals) = splice (@opcb, $#opcb-3); X printf STDERR ("%04x: [%02x %02x %02x%02x (%d)", $wpdptr, X splice (@opcb, 0, 4), $opc_len); X if ( $opt_analyze == 1 && @opcb > 6 ) { X printf STDERR (" %02x %02x ... %02x %02x", X $opcb[0], $opcb[1], $opcb[$#opcb-1], $opcb[$#opcb]); X } X else { X foreach $b ( @opcb ) { X printf STDERR (" %02x", $b); X } X } X printf STDERR (" %02x%02x %02x %02x]\n", @finals); X } X die ("*** OPCODE LENGHT ERROR ***\n") X unless ($opc[$#opc-3] | ($opc[$#opc-2] << 8)) == $opc_len; X $wpdptr += $opc_len += 4; X die ("*** OPCODE SUBFUN ERROR ***\n") unless $opc[1] == $opc[$#opc-1]; X die ("*** OPCODE ERROR ***\n") unless $opc == $opc[$#opc]; X return $opc_type; X} SHAR_EOF chmod 0644 wpscan.pl || echo 'restore of wpscan.pl failed' Wc_c="`wc -c < 'wpscan.pl'`" test 6374 -eq "$Wc_c" || echo 'wpscan.pl: original size 6374, current size' "$Wc_c" fi exit 0 -- Johan Vromans jv@mh.nl via internet backbones Multihouse Automatisering bv uucp:..!{uunet,sun4nl}!mh.nl!jv Doesburgweg 7, 2803 PL Gouda, The Netherlands phone/fax: +31 1820 62911/62500 ------------------------ "Arms are made for hugging" -------------------------