正規表現 -- URLからホスト名、ポート番号、PATHを切り出す

戻る


::::::::::::::
test1.pl
::::::::::::::
#!/usr/bin/perl
# $Id: regex-perl-and-c.html,v 1.1 2009/06/22 16:12:23 kishi Exp kishi $

use strict;
use warnings;

# Sample URLs to split into host name, port number and path.
my @URLs =  (
	"http://asistobe851.hp.infoseek.co.jp/my-memo/perl-regex.html",
	"http://asistobe851.hp.infoseek.co.jp:8080/my-memo/",
	"http://asistobe851.hp.infoseek.co.jp:80/my-memo/index.html"
	);

# parse_url($url)
#
# Splits an http:// URL into its host name, port number and path.
#   - the port defaults to 80 when the URL carries no ":port" part
#   - the path defaults to "/" when the URL ends right after the host/port
# Returns the list (hostname, portnum, path), or an empty list when $url
# is undefined or is not a well-formed http URL.
sub parse_url {
	my ($url) = @_;

	return unless defined $url;

	# One anchored pattern does the whole job; the port and the path are
	# optional so that "http://host/" and "http://host" are accepted too.
	my ( $hostname, $portnum, $path ) =
		$url =~ m{\A http:// ([^/:]+) (?: : (\d+) )? (/.*)? \z}xmsi;

	return unless defined $hostname;

	$portnum = defined $portnum ? $portnum + 0 : 80;
	$path    = '/' unless defined $path;

	return ( $hostname, $portnum, $path );
}

foreach my $target (@URLs) {

	print STDERR $target . "\n";

	my ( $hostname, $portnum, $path ) = parse_url($target);

	# Never print undefined fields: report the bad URL and move on.
	unless ( defined $hostname ) {
		print STDERR "\tcannot parse URL\n";
		next;
	}

	printf( "\thostname: %s\n", $hostname);
	printf( "\tportnum: %d\n", $portnum);
	printf( "\tpath: %s\n", $path);

}
::::::::::::::
Makefile
::::::::::::::
# $Id: regex-perl-and-c.html,v 1.1 2009/06/22 16:12:23 kishi Exp kishi $

# Tools are variables so they can be overridden on the command line,
# e.g. "make CC=clang".
CC = gcc
STRIP = strip
RM = rm -f

# NOTE: -c (compile only) is part of CFLAGS, so CFLAGS must not be used
# on the link line below.
CFLAGS = -c -Wall -O2
OBJECT = test1.o
EXE = test1.exe

# Default target: link the object file and strip the symbols.
$(EXE): $(OBJECT)
	$(CC) $(OBJECT) -o $(EXE) && $(STRIP) $(EXE)

test1.o:  test1.c
	$(CC) $(CFLAGS) test1.c

# clean and indent do not produce files of the same name; declaring them
# phony keeps them working even if a file called "clean" or "indent" exists.
.PHONY: clean indent

# Remove build products and editor/formatter backup files.
clean:
	$(RM) *.o *.exe *~ *.bak *.orig

# Reformat the C sources with astyle.
indent:
	astyle -c -a -P *.c

::::::::::::::
test1.c
::::::::::::::
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>

/**
$Id: regex-perl-and-c.html,v 1.1 2009/06/22 16:12:23 kishi Exp kishi $ 
 
だらだらとコピペコードが多くて恐縮です
とりあえずやっつけで作っております。

はっきり言って美しくない⇒今後改善予定!
*/

/**
 * Result of splitting a URL with getTokens().
 *
 * hostname and path are allocated with malloc(); the caller owns them and
 * must free() them when result == 0.
 */
typedef struct {
    int result;      /* 0 on success, -1 when the URL could not be parsed */
    char *hostname;  /* host name without the port; NULL on failure */
    int portnum;     /* port from the URL, 80 if none given; 0 on failure */
    char *path;      /* rest of the URL after host[:port]; NULL on failure */
}
Tokens;

/* Returns a malloc'ed, NUL-terminated copy of the part of src covered by
 * the regex match m, or NULL if the allocation fails. */
static char *
copyMatch ( const char *src, const regmatch_t *m ) {
    size_t len = ( size_t ) ( m->rm_eo - m->rm_so );
    char *dst = ( char * ) malloc ( len + 1 );

    if ( dst == NULL ) {
        return NULL;
    }

    memcpy ( dst, src + m->rm_so, len );
    dst[ len ] = '\0';

    return dst;
}

/**
 * Splits an "http://host[:port][/path]" URL into host name, port and path.
 *
 * @param input  NUL-terminated URL string; it is not modified.
 * @return a Tokens value. result is 0 on success and -1 when input is NULL,
 *         does not match, or memory/regex setup fails. On failure hostname
 *         and path are NULL, so the caller has nothing to free.
 */
Tokens
getTokens ( char *input ) {
    Tokens tokens;
    regex_t reg;
    regmatch_t match[ 3 ];
    int status;
    char *hostport;
    char *path;
    int portnum = 80;		/* default when the URL has no ":port" */

    /* Start from a fully defined failure value. */
    tokens.result = -1;
    tokens.hostname = NULL;
    tokens.portnum = 0;
    tokens.path = NULL;

    if ( input == NULL ) {
        return tokens;
    }

    /* Step 1: split into "host[:port]" (group 1) and the path (group 2).
     * NOTE: inside the bracket expression "=-\" forms a range and the
     * backslash is a literal character; the pattern is kept unchanged so
     * that the set of accepted URLs stays the same. */
    if ( regcomp ( &reg, "http://([[:alnum:].~?&%#_=-\\:]+)(\\/?.*)", REG_EXTENDED ) != 0 ) {
        return tokens;
    }
    status = regexec ( &reg, input, 3, match, 0 );
    regfree ( &reg );

    if ( status != 0 ) {
        return tokens;
    }

    hostport = copyMatch ( input, &match[ 1 ] );
    path = copyMatch ( input, &match[ 2 ] );
    if ( hostport == NULL || path == NULL ) {
        free ( hostport );
        free ( path );
        return tokens;
    }

    /* Step 2: look for a port number inside "host[:port]". */
    if ( regcomp ( &reg, "(.+):(.+)", REG_EXTENDED ) != 0 ) {
        free ( hostport );
        free ( path );
        return tokens;
    }
    status = regexec ( &reg, hostport, 3, match, 0 );
    regfree ( &reg );

    if ( status == 0 ) {
        /* These offsets are relative to hostport, NOT to input, so all
         * copying below must use hostport as the base. */
        size_t hostlen = ( size_t ) ( match[ 1 ].rm_eo - match[ 1 ].rm_so );

        /* Read the port first: it lies after the host part that is
         * about to be terminated in place. */
        hostport[ match[ 2 ].rm_eo ] = '\0';
        portnum = atoi ( hostport + match[ 2 ].rm_so );

        /* Shrink the buffer contents to just the host name. */
        memmove ( hostport, hostport + match[ 1 ].rm_so, hostlen );
        hostport[ hostlen ] = '\0';
    }

    tokens.hostname = hostport;
    tokens.portnum = portnum;
    tokens.path = path;
    tokens.result = 0;		// SUCCESS

    return tokens;
}

/**
 * Runs getTokens() over a fixed list of sample URLs.
 *
 * Each URL is echoed to stderr; for every URL that parses, the host name,
 * port number and path are printed to stdout, one per line.
 * Always returns 0.
 */
int
main ( int argc, char **argv ) {
    char * URLs[] = {
                        "http://asistobe851.hp.infoseek.co.jp/my-memo/perl-regex.html",
                        "http://asistobe851.hp.infoseek.co.jp:8080/my-memo/",
                        "http://asistobe851.hp.infoseek.co.jp:80/my-memo/index.html",
                    };

    //-----------------------------------
    // Number of URLs in the table above
    //-----------------------------------
    int count = sizeof ( URLs ) / sizeof ( *URLs );

    int i;

    /* The command line is not used. */
    ( void ) argc;
    ( void ) argv;

    for ( i = 0; i < count; i++ ) {
        fprintf ( stderr, "%s\n", URLs[ i ] );

        Tokens tokens = getTokens ( URLs[ i ] );

        if ( tokens.result == 0 ) {
            printf ( "-- %s\n", tokens.hostname );
            printf ( "-- %d\n", tokens.portnum );
            printf ( "-- %s\n", tokens.path );

            /* getTokens() allocates both strings with malloc(); release
             * them here so the loop does not leak on every URL. */
            free ( tokens.hostname );
            free ( tokens.path );
        }
    }

    return 0;
}


■実行結果

$ ./test1.pl
http://asistobe851.hp.infoseek.co.jp/my-memo/perl-regex.html
        hostname: asistobe851.hp.infoseek.co.jp
        portnum: 80
        path: /my-memo/perl-regex.html
http://asistobe851.hp.infoseek.co.jp:8080/my-memo/
        hostname: asistobe851.hp.infoseek.co.jp
        portnum: 8080
        path: /my-memo/
http://asistobe851.hp.infoseek.co.jp:80/my-memo/index.html
        hostname: asistobe851.hp.infoseek.co.jp
        portnum: 80
        path: /my-memo/index.html

$ ./test1
http://asistobe851.hp.infoseek.co.jp/my-memo/perl-regex.html
-- asistobe851.hp.infoseek.co.jp
-- 80
-- /my-memo/perl-regex.html
http://asistobe851.hp.infoseek.co.jp:8080/my-memo/
-- http://asistobe851.hp.infosee
-- 8080
-- /my-memo/
http://asistobe851.hp.infoseek.co.jp:80/my-memo/index.html
-- http://asistobe851.hp.infosee
-- 80
-- /my-memo/index.html
戻る inserted by FC2 system