PERLで正規表現
戻る
#!/usr/bin/perl
# $Id: perl-regex.html,v 1.1 2009/06/22 16:12:19 kishi Exp kishi $
# Description: HTMLを読み込み、正規表現を使って特定の箇所を抽出するテスト
use strict;
# Validate the command line: exactly one argument (the HTML file) is required.
if ( @ARGV != 1 ) {
    print STDERR "Usage: singleline.pl [HTML]" . "\n";
    exit -1;    # the shell sees this as exit status 255
}
my $htmlFile = $ARGV[0];

# Open the HTML file to process.
# Three-argument open with an explicit read mode is used so that a file name
# beginning with ">" or ending with "|" cannot be interpreted as a mode or a
# command; the lexical handle is closed automatically if we die early.
open( my $html_fh, '<', $htmlFile ) or die "cannot open $htmlFile: $!";

# Read the whole file into $buffer, normalising every line ending to a single
# LF ("\n"). The matching done later relies on this: CRLF input would
# otherwise leave stray "\r" characters in the extracted text.
my $buffer = '';
while ( my $line = <$html_fh> ) {
    $line =~ s/\r?\n\z//;    # drop a trailing CRLF or LF, if any
    $buffer .= $line . "\n";
}

# Close the HTML file.
close($html_fh);
#======================================================================================================
# Extract the required parts with regular expressions and print them as XML
#======================================================================================================
my $xmlEncoding = "Shift_JIS";
print qq{<?xml version="1.0" encoding="$xmlEncoding" ?>} . "\n";
print "<data>" . "\n";

# The quantifier is non-greedy so the match stops at the FIRST </title>;
# a greedy ".+" combined with /s would run on to the last </title> found
# anywhere in the document.
my $titleXml = getResult( $buffer, "<title>(.+?)</title>", "TITLE" );

# getResult yields nothing when the pattern does not match; in that case the
# <data> element is simply left empty.
print $titleXml if defined $titleXml;

print "</data>" . "\n";
exit(0);
# Extract the first capture group of a pattern from a text and wrap it in an
# XML element.
#
# Arguments:
#   $source      - text to search
#   $regex       - pattern source; its first capture group supplies the
#                  element content
#   $elementName - name of the XML element to emit
#
# The pattern is applied with /s (so "." also matches newlines, letting a
# match span several lines) and /i (case-insensitive).
#
# Returns the string "<NAME>\n<escaped text>\n</NAME>\n", where the captured
# text has leading/trailing whitespace removed and is passed through
# escapeURL. Returns undef when the pattern does not match.
#
# Note: the sub is declared without a prototype. An empty "()" prototype
# would declare that it takes no arguments, which contradicts the three
# arguments it actually receives.
sub getResult {
    my ( $source, $regex, $elementName ) = @_;

    my $result;
    if ( $source =~ /$regex/si ) {

        # A pattern without a capture group leaves $1 undefined; treat that
        # as empty content rather than propagating undef.
        my $token = defined $1 ? $1 : '';

        # Trim surrounding whitespace (including the newlines around the text).
        $token =~ s/\A\s+//;
        $token =~ s/\s+\z//;

        $result = "<" . $elementName . ">" . "\n";
        $result .= escapeURL($token) . "\n";
        $result .= "</" . $elementName . ">" . "\n";
    }
    return $result;
}
# Escape the characters that are special in XML so the value can be embedded
# safely as element content or inside a double-quoted attribute.
#
# Despite its name, this performs XML entity escaping, not URL encoding.
#
# Argument: the raw text (an undefined value is treated as an empty string).
# Returns:  the text with & < > " replaced by &amp; &lt; &gt; &quot;.
sub escapeURL {
    my $value = shift;
    $value = '' unless defined $value;

    # "&" must be handled first; otherwise the ampersands introduced by the
    # replacements below would themselves be escaped again.
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
}
■以下実行結果
$ ./singleline.pl test.html
<?xml version="1.0" encoding="Shift_JIS" ?>
<data>
<TITLE>
ここにタイトルが入ります!
</TITLE>
</data>
■入力ファイル
$ cat test.html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS">
<meta http-equiv="Content-Script-Type" content="text/javascript">
<meta http-equiv="Content-Style-Type" content="text/css">
<meta name="description" content="">
<meta name="keywords" content="">
<link rel="shortcut icon" href="/img/tennis365.ico">
<link rel="stylesheet" href="/common/css/import_t.css" type="text/css">
<script type="text/javascript" src="/common/js/banner_random.js"></script>
<script type="text/javascript" src="/common/js/basic.js"></script>
<title>
ここにタイトルが入ります!
</title>
<SCRIPT LANGUAGE="JavaScript">
var cc_tagVersion = "1.0";
var cc_accountID = "6594445900";
var cc_marketID = "4";
var cc_protocol="http";
var cc_subdomain = "convctr";
if(location.protocol == "https:")
{
cc_protocol="https";
cc_subdomain="convctrs";
}
var cc_queryStr = "?" + "ver=" + cc_tagVersion + "&aID=" + cc_accountID + "&mkt=" + cc_marketID +"&ref=" + escape(document.referrer);
var cc_imageUrl = cc_protocol + "://" + cc_subdomain + ".overture.com/images/cc/cc.gif" + cc_queryStr;
var cc_imageObject = new Image();
cc_imageObject.src = cc_imageUrl;
// -->
</SCRIPT>
</head>
</html>
戻る