开云体育

ctrl + shift + ? for shortcuts
© 2025 开云体育

Re: General: Anyone know how to reference a webpage and read it?


Amit Kulkarni
 

Scott:
Get wget & perl



wget -O cmeprices.html "<URL of the prices page>"

perl html2text cmeprices.html

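The command above runs the script below (save it as "html2text") to remove the HTML tags; the plain text is written to cmeprices.txt.
Then parse that text file by searching for one of the Jan/Feb/Mar/Apr strings...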

HTH

#!/usr/bin/perl
######################################################################
# HTML to text converter Version 1.01 #
# Copyright 1999 Frederic TYNDIUK (FTLS) All Rights Reserved. #
# E-Mail: tyndiuk@... Script License: GPL #
# Created 06/30/99 Last Modified 06/30/99 #
# Scripts Archive at: #
######################################################################
# Function : #
# Suppress All HTML TAGs in a file. #
######################################################################
##################### license & copyright header #####################
# #
# Copyright (c) 1999 TYNDIUK Frederic #
# #
# This program is free software; you can redistribute it and/or #
# modify it under the terms of the GNU General Public License as #
# published by the Free Software Foundation; either version 2 of #
# the License, or (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program in the file 'COPYING'; if not, write to #
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, #
# Boston, MA 02111-1307, USA, or contact the author: #
# #
# TYNDIUK Frederic <tyndiuk@...> #
# <> #
# #
################### end license & copyright header ###################
######################################################################
# Script-wide settings.  These are package globals because the option
# parser, the main loop and Usage() all read them.
$Version = "1.01";

# Banner printed by --version and at the top of the usage message.
# NOTE(review): the original banner also advertised a "News and Updates"
# URL, but the URL is missing from this copy of the script; add it back
# here if it is known.
$Copyright  = "HTML to Text converter v$Version (C) 1999 Frederic TYNDIUK (alias FTLS)\n";
$Copyright .= "Report bugs to tyndiuk\@ftls.org\n";

# Base name given with -r/--result ("" means: derive it from each input).
$NameResultFile = $BgColor = "";

# $No counts the files converted so far; it is used to number result files.
$UsePre = $No = 0;


# Check the command-line arguments.
#
# Leading arguments that start with "-" are treated as options:
#   -r NAME, --result NAME   base name of the result file; a trailing
#                            ".txt" is removed because it is added again
#                            when the result file name is built
#   -v, --version            print the banner and exit
#   -h, --help               print the usage message and exit
# Unrecognised options are silently skipped, as before.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $option = shift(@ARGV);

    if ($option eq "-r" || $option eq "--result") {
        # The argument after -r is always consumed, even when it is empty.
        if (@ARGV) {
            my $name = shift(@ARGV);
            if ($name ne "") {
                $NameResultFile = $name;
                # Anchored so only a real ".txt" extension is stripped.
                $NameResultFile =~ s/\.txt\z//;
            }
        }
    }
    elsif ($option eq "-v" || $option eq "--version") {
        print $Copyright;
        exit;
    }
    elsif ($option eq "-h" || $option eq "--help") {
        Usage();
    }
}

# At least one HTML file is required.
if (@ARGV < 1) {
    Usage();
}

# Convert every HTML file named on the command line.
#
# Without -r the result name is the source name with its final extension
# replaced by ".txt".  With -r the given base name is used; the second and
# later files get "-1", "-2", ... appended so they do not overwrite each
# other.  Processing stops at the first empty argument.
while (defined($ARGV[0]) && $ARGV[0] ne "") {
    my $source = shift(@ARGV);
    my $result;

    if ($NameResultFile eq "") {
        $result = $source;
        # Strip only the last extension.  The lookbehind keeps "./x.html"
        # and dot-files such as ".htaccess" from losing their leading dot.
        $result =~ s/(?<=[^\/])\.\w*\z//;
    }
    else {
        $result = $NameResultFile;
        $result .= "-" . $No if $No > 0;
    }

    $No++;
    html2txt($source, $result . ".txt");
}

# Convert one HTML file to plain text.
#
# Arguments:
#   $SourceFile - path of the HTML file to read
#   $ResultFile - path of the text file to write (overwritten if present)
#
# Everything up to and including the first </HEAD> is dropped, whitespace
# runs are collapsed, paragraph/line-level tags become newlines, all other
# tags are removed and the entities listed by HTMLSymb() are decoded.
# Dies if a file cannot be opened or the result cannot be written.
sub html2txt {
    my ($SourceFile, $ResultFile) = @_;

    open(my $in, '<', $SourceFile)
        or die("Cannot open HTML source file : $SourceFile, Error $!\n");
    my $HTML = do { local $/; <$in> };    # slurp the whole file
    close($in);
    $HTML = "" unless defined $HTML;

    # Drop the document head.  Split only when </HEAD> is present, so that
    # a page without a head section is not turned into empty output, and
    # limit the split to two parts so text after a stray second </HEAD>
    # is kept.  $Head stays a package variable, as in the original.
    if ($HTML =~ /<\/HEAD>/i) {
        ($Head, $HTML) = split(/<\/HEAD>/i, $HTML, 2);
    }

    # Non-breaking spaces (named or numeric) become ordinary spaces.
    $HTML =~ s/&(?:nbsp|#160);/ /g;
    $HTML =~ s/\s+/ /g;

    # <p> -> blank line.  \b keeps <pre>, <param>, ... from matching.
    $HTML =~ s/<p\b[^>]*>/\n\n/gi;

    # <br>, <hN>/</hN>, <li>, <dt>, <dd>, </tr> -> newline.  Attributes and
    # XHTML-style "<br />" are accepted; \b keeps <link> etc. from matching.
    $HTML =~ s{< (?: br | /*h[1-6] | li | dt | dd | /tr ) \b [^>]*>}{\n}gix;

    # Remove every remaining tag, then tidy up blank lines and indentation.
    $HTML =~ s/<[^>]*>//g;
    $HTML =~ s/\n\s*\n\s*/\n\n/g;
    $HTML =~ s/\n */\n/g;

    # Decode entities.  Each table entry is "<character> <entity>".
    # "&amp;" is held back and decoded last; decoding it first would turn
    # "&amp;lt;" into "&lt;" and then wrongly into "<".
    my @deferred;
    foreach my $SymbLine (HTMLSymb()) {
        my ($ascii, $html) = split(/\s\s*/, $SymbLine);
        next unless defined $html && $html ne "";
        if ($html eq "&amp;") {
            push(@deferred, $ascii);
            next;
        }
        $HTML =~ s/\Q$html\E/$ascii/g;
    }
    foreach my $ascii (@deferred) {
        $HTML =~ s/&amp;/$ascii/g;
    }

    # Write the result file.
    open(my $out, '>', $ResultFile)
        or die("Cannot write file $ResultFile, Error $!");
    print {$out} $HTML;
    # Buffered write errors only show up at close time.
    close($out)
        or die("Cannot write file $ResultFile, Error $!");
}

# Print the banner and the command-line help to STDERR, then exit with
# status 1.  Called for -h/--help and when no input file is given.
sub Usage {
    print STDERR <<EOF;
$Copyright
Usage: $0 [Options] HTMLFile...
Options:
  -r File, --result File   result file Name (without .txt)
  -v, --version            output version information and exit
  -h, --help               display this help and exit
EOF
    exit 1;
}

# HTML Codes
#
# Returns the entity table as a list of strings of the form
# "<character> <entity>" (character, whitespace, entity).  Characters
# above 127 are ISO 8859-1 (Latin-1) code points, written as \x escapes
# or built with chr() so this source file stays plain ASCII.
#
# "&amp;" is returned last on purpose: a caller that applies the table in
# order must decode it after everything else, otherwise "&amp;lt;" would
# first become "&lt;" and then "<".
sub HTMLSymb {
    my @basic = (
        "\" &quot;",
        "< &lt;",
        "> &gt;",
    );

    my @named = (
        "\xA9 &copy;",
        "\xAE &reg;",
        "\xC6 &AElig;",
        "\xC1 &Aacute;",
        "\xC2 &Acirc;",
        "\xC0 &Agrave;",
        "\xC5 &Aring;",
        "\xC3 &Atilde;",
        "\xC4 &Auml;",
        "\xC7 &Ccedil;",
        "\xD0 &ETH;",
        "\xC9 &Eacute;",
        "\xCA &Ecirc;",
        "\xC8 &Egrave;",
        "\xCB &Euml;",
        "\xCD &Iacute;",
        "\xCE &Icirc;",
        "\xCC &Igrave;",
        "\xCF &Iuml;",
        "\xD1 &Ntilde;",
        "\xD3 &Oacute;",
        "\xD4 &Ocirc;",
        "\xD2 &Ograve;",
        "\xD8 &Oslash;",
        "\xD5 &Otilde;",
        "\xD6 &Ouml;",
        "\xDE &THORN;",
        "\xDA &Uacute;",
        "\xDB &Ucirc;",
        "\xD9 &Ugrave;",
        "\xDC &Uuml;",
        "\xDD &Yacute;",
        "\xE1 &aacute;",
        "\xE2 &acirc;",
        "\xE6 &aelig;",
        "\xE0 &agrave;",
        "\xE5 &aring;",
        "\xE3 &atilde;",
        "\xE4 &auml;",
        "\xE7 &ccedil;",
        "\xE9 &eacute;",
        "\xEA &ecirc;",
        "\xE8 &egrave;",
        "\xF0 &eth;",
        "\xEB &euml;",
        "\xED &iacute;",
        "\xEE &icirc;",
        "\xEC &igrave;",
        "\xEF &iuml;",
        "\xF1 &ntilde;",
        "\xF3 &oacute;",
        "\xF4 &ocirc;",
        "\xF2 &ograve;",
        "\xF8 &oslash;",
        "\xF5 &otilde;",
        "\xF6 &ouml;",
        "\xDF &szlig;",
        "\xFE &thorn;",
        "\xFA &uacute;",
        "\xFB &ucirc;",
        "\xF9 &ugrave;",
        "\xFC &uuml;",
        "\xFD &yacute;",
        "\xFF &yuml;",
    );

    # Numeric references: "&#N;" stands for the Latin-1 character chr(N).
    # The set of code points is the one the original table listed
    # (164 was not in it).
    my @numeric = map { chr($_) . " &#" . $_ . ";" }
        (161 .. 163, 165 .. 191, 215, 222, 247);

    return (
        @basic,
        @named,
        " &#160;",    # kept verbatim from the original table
        @numeric,
        "& &amp;",    # must stay last, see the header comment
    );
}

__________________________________________________
Do You Yahoo!?
Yahoo! - Official partner of 2002 FIFA World Cup