Perlで正規表現

戻る
#!/usr/bin/perl
# $Id: perl-regex.html,v 1.1 2009/06/22 16:12:19 kishi Exp kishi $
# Description: HTMLを読み込み、正規表現を使って特定の箇所を抽出するテスト

use strict;

# Require exactly one command-line argument: the path of the HTML file to scan.
if( @ARGV != 1 ){
	print STDERR "Usage: singleline.pl [HTML]" . "\n";
	exit -1;	# the shell sees this as exit status 255
}
my $htmlFile = $ARGV[0];

# Open the input HTML file.
# Three-argument open with an explicit '<' mode makes Perl treat the argument
# strictly as a file name to read; with two-argument open a name such as
# ">file" or "cmd |" would be interpreted as a mode/pipe instead.
# $! is included so the reason for a failure is reported.
open(my $htmlHandle, '<', $htmlFile) or die "cannot open $htmlFile: $!";

# Read the whole file into a single string so that the pattern can match
# across line boundaries. Every line ending (CRLF or LF) is normalised to a
# bare LF, and a final line without a terminator gets one appended.
my $buffer = '';	# stays an empty string (not undef) for an empty file
while (my $line = <$htmlHandle>) {
    $line =~ s/\r?\n\z//;
    $buffer .= $line . "\n";
}

# Close the HTML file.
close($htmlHandle);

#======================================================================================================
# Extract the wanted parts with a regular expression and print them as XML
#======================================================================================================
my $xmlEncoding = "Shift_JIS";
print qq{<?xml version="1.0" encoding="$xmlEncoding" ?>} . "\n";
print "<data>" . "\n";

# Called with & so the call does not depend on getResult's prototype
# (the sub is defined later in the file).
# getResult returns undef when the pattern does not match; print nothing then.
my $titleXml = &getResult($buffer, "<title>(.+)</title>", "TITLE");
print $titleXml if defined $titleXml;

print "</data>" . "\n";

exit(0);

# getResult(SOURCE, REGEX, ELEMENTNAME)
#
# Matches REGEX against SOURCE and wraps the text of the first capture group
# in an XML element named ELEMENTNAME.
#
#   SOURCE      - text to search (may span several lines)
#   REGEX       - pattern string; its first capture group selects the text
#   ELEMENTNAME - name used for the opening and closing tags (emitted as-is)
#
# Returns "<ELEMENTNAME>\n<escaped text>\n</ELEMENTNAME>\n" when the pattern
# matches, or undef when it does not.
#
# The sub is declared without a prototype: it takes three arguments, so an
# empty "()" prototype would reject every ordinary getResult(...) call that
# is compiled after this definition.
sub getResult {
	my $result;

	# Unpack the arguments.
	my( $source, $regex, $elementName ) = @_;

	# /s lets "." match newlines, so the pattern may span lines;
	# /i makes the match case-insensitive.
	if( $source =~ /$regex/si ){
		# A pattern without a capture group leaves $1 undefined; treat that
		# as empty text.
		my $token = defined $1 ? $1 : '';

		# Trim leading and trailing whitespace (including newlines).
		$token =~ s/\A\s+//;
		$token =~ s/\s+\z//;

		$result = "<" . $elementName . ">" . "\n";
		$result .= escapeURL($token) . "\n";
		$result .= "</" . $elementName . ">" . "\n";
	}

	return $result;
}

# escapeURL(TEXT)
#
# Returns a copy of TEXT in which the characters & < > " are replaced by the
# entities &amp; &lt; &gt; &quot;. Despite the name, no URL encoding is done.
sub escapeURL {
	my ($text) = @_;

	# Replacement table for the four characters handled here.
	my %entityFor = (
		'&' => '&amp;',
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
	);

	# One pass over the string: each special character is replaced exactly
	# once, so an ampersand produced by a replacement is never re-escaped.
	$text =~ s/([&<>"])/$entityFor{$1}/g;

	return $text;
}

■以下実行結果

$ ./singleline.pl test.html
<?xml version="1.0" encoding="Shift_JIS" ?>
<data>
<TITLE>
ここにタイトルが入ります!
</TITLE>
</data>

■入力ファイル

$ cat test.html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS">
<meta http-equiv="Content-Script-Type" content="text/javascript">
<meta http-equiv="Content-Style-Type" content="text/css">
<meta name="description" content="">
<meta name="keywords" content="">
<link rel="shortcut icon" href="/img/tennis365.ico">
<link rel="stylesheet" href="/common/css/import_t.css" type="text/css">
<script type="text/javascript" src="/common/js/banner_random.js"></script>
<script type="text/javascript" src="/common/js/basic.js"></script>
<title>
ここにタイトルが入ります!
</title>

<SCRIPT LANGUAGE="JavaScript">
var cc_tagVersion = "1.0";
var cc_accountID = "6594445900";
var cc_marketID =  "4";
var cc_protocol="http";
var cc_subdomain = "convctr";
if(location.protocol == "https:")
{
    cc_protocol="https";
     cc_subdomain="convctrs";
}
var cc_queryStr = "?" + "ver=" + cc_tagVersion + "&aID=" + cc_accountID + "&mkt=" + cc_marketID +"&ref=" + escape(document.referrer);
var cc_imageUrl = cc_protocol + "://" + cc_subdomain + ".overture.com/images/cc/cc.gif" + cc_queryStr;
var cc_imageObject = new Image();
cc_imageObject.src = cc_imageUrl;
// -->
</SCRIPT>

</head>
</html>
戻る inserted by FC2 system