[파이썬(Python)] #9. re 모듈, 정규식 표현

python

by 빨간눈동자 2021. 8. 22. 23:20

# test.py
import re

# Regular string literal: the \t escape is interpreted as a single tab character.
a = 'abc\tdef'
# Raw string literal (r prefix): the backslash and the 't' stay as two literal characters.
b = r'abc\tdef'

print(f'a = {a}')  # 'abc' and 'def' separated by a tab
print(f'b = {b}')  # b = abc\tdef

일반적으로 문자열('') 안의 이스케이프 문자(\t)는 특수한 의미를 가진다. '\t'는 tab 공백을 삽입한다.

하지만, 문자열 앞에 r 을 붙이게 되면, 이스케이프 문자 그 자체를 문자로 인식한다.

import re

# 'abc' occurs inside '123abcde' (index 3 to 6), so search returns a Match object.
c = re.search(r'abc', '123abcde')
print(f'c = {c}')

# '1234acde' contains no 'abc' substring, so search returns None.
d = re.search(r'abc', '1234acde')
print(f'd = {d}')

re.search( 패턴, 문자열)

[] 문자들의 범위를 나타내기 위해 사용

[abck] : a or b or c or k
[abc.^] : a or b or c or . or ^
[a-d] : -와 함께 사용되면 해당 문자 사이의 범위에 속하는 문자 중 하나
[0-9] : 모든 숫자
[a-z] : 모든 소문자
[A-Z] : 모든 대문자
[a-zA-Z0-9] : 모든 알파벳 문자 및 숫자
[^0-9] : ^가 맨 앞에 사용 되는 경우 해당 문자 패턴이 아닌 것과 매칭

import re

# [abc]at: one of a/b/c followed by 'at'.
e = re.search(r'[abc]at', 'cat')   # matches 'cat'
f = re.search(r'[abc]at', 'dat')   # 'd' is not in the class -> None
# [0-9]%: a single digit followed by a percent sign.
g = re.search(r'[0-9]%', '5%')     # matches '5%'
print(f'e = {e}')
print(f'f = {f}')
print(f'g = {g}')

.(마침표)는 줄바꿈 문자(\n)를 제외한 모든 문자 하나를 의미 (DOTALL 옵션을 주면 줄바꿈 문자도 포함)

import re

# The dot stands for any single character between 'c' and 't'.
h = re.search(r'c.t', 'cat')   # '.' matches 'a'
i = re.search(r'c.t', 'c1t')   # '.' matches '1'

print(f'h = {h}')
print(f'i = {i}')

\d : 숫자, [0-9]와 동일
\D : 숫자가 아닌 문자 [^0-9]와 동일
\s : 공백 문자(띄어쓰기, 탭, 엔터 등)
\S : 공백이 아닌 문자
\w : 알파벳 대소문자, 숫자, 밑줄(_) [0-9a-zA-Z_]와 동일
\W : non alpha-numeric 문자(밑줄 제외) [^0-9a-zA-Z_]와 동일
\t, \n, \r - tab, newline, return
\. : .
\\ : \

# \d on each side of 'abc': the first position where that fits is '2abc0' (index 2 to 7).
j = re.search(r'\dabc\d', '012abc012')
print(f'j = {j}')

반복패턴
'+' : 1번 이상의 패턴이 발생
'*' : 0번 이상의 패턴이 발생
'?' : 0 혹은 1번의 패턴이 발생

import re

# b+ requires one or more 'b': matches the whole of 'abbbc'.
k = re.search(r'ab+c', 'abbbc')
print(f'k = {k}')

# [bcd]* allows zero occurrences, so 'ae' matches with nothing between 'a' and 'e'.
l = re.search(r'a[bcd]*e', 'ae')
print(f'l = {l}')

# \w+ is greedy; it backtracks just enough for the final 'a', matching all of 'abxedfea'.
m = re.search(r'a\w+a', 'abxedfea')
print(f'm = {m}')

^ : 문자열의 맨 앞부터 일치하는 경우 검색
$ : 문자열의 맨 뒤부터 일치하는 경우 검색
^ = 시작, $ = 끝 : 각각 문자열의 시작과 끝을 의미

import re

# ^ anchors the match at the start of the string: 'apple' matches from index 0.
n = re.search(r'^ap+le', 'apple')
print(f'n = {n}')

# $ anchors the match at the end: 'o', then 'ang' from [rang]+, then the final 'e'.
o = re.search(r'o[rang]+e$', 'oange')
print(f'o = {o}')

()을 사용하여 grouping

import re

# Two capture groups: the local part before '@' and the domain after it.
# The dot in the domain is escaped (\.) so it matches only a literal '.';
# an unescaped '.' would accept any character there (e.g. 'gmail_com').
p = re.search(r'(\w+)@(\w+\.\w+)', 'test@gmail.com')
print(f'p = {p}')
print(f'p.group(0) = {p.group(0)}')  # whole match: test@gmail.com
print(f'p.group(1) = {p.group(1)}')  # first group: test
print(f'p.group(2) = {p.group(2)}')  # second group: gmail.com

{}를 사용하여 반복횟수 명시 가능

import re

# p{3,5}: between 3 and 5 consecutive 'p' characters.
q = re.search(r'ap{3,5}le', 'appple')     # p repeated 3 times -> match
r = re.search(r'ap{3,5}le', 'appppple')   # p repeated 5 times -> match
s = re.search(r'ap{3,5}le', 'apppppple')  # p repeated 6 times -> None

print(f'q = {q}')
print(f'r = {r}')
print(f's = {s}')

?를 사용하면 최소 match
{m,n} : m번에서 n번 반복하면 okay, 가능한 한 많이 매치 ( maximum )
{m,n}? : m번만 반복해도 okay, 가능한 한 적게 매치 ( minimum )

import re

# Greedy {3,6} takes as many as allowed: matches all six 'a' characters.
t = re.search(r'a{3,6}', 'aaaaaa')
# Lazy {3,6}? stops at the minimum: matches only the first three 'a' characters.
u = re.search(r'a{3,6}?', 'aaaaaa')

print(f't = {t}')
print(f'u = {u}')

re.match() : re.search와 유사하나, 주어진 문자열의 시작부터 비교

import re

# re.match only tries at the beginning of the string.
v = re.match(r'ab+c', 'bbbbbbc')    # string does not start with 'a' -> None
w = re.match(r'ab+c', 'abbbbbbc')   # starts with 'a' followed by b's and 'c' -> match
print(f'v = {v}')
print(f'w = {w}')

re.search가 최초 match되는 패턴을 찾는다면,
findall은 매칭되는 전체 패턴을 찾음

import re

# \w already includes digits, so the original class [\w\d] was redundant; \w+ is equivalent.
# findall returns every non-overlapping match as a list of strings.
x = re.findall(r'\w+@[\w.]+', 'test1@gmail.com is my e-mail address. please remember that test2@gmail.com')
print(f'x[0] = {x[0]}')  # test1@gmail.com
print(f'x[1] = {x[1]}')  # test2@gmail.com

re.sub : 주어진 문자열에서 패턴과 일치하는 모든 부분을 replace함 (count 인자를 주면 앞에서부터 그 횟수만큼만 replace)

import re

# \w already includes digits, so the original class [\w\d] was redundant; \w+ is equivalent.
# count=1 limits the substitution to the first match; without it every match is replaced.
y = re.sub(r'\w+@[\w.]+', '----', 'test1@gmail.com is my e-mail address. please remember that test2@gmail.com', count=1)
print(f'y = {y}')

compile : 동일한 정규식을 여러 번 사용할 때 re.Pattern 객체로 컴파일(저장)하여 사용

import re

# Compile once, then call search() on the resulting pattern object.
z_tmp = re.compile(r'[\d]+@[\w.]+')
# Digits, '@', then word characters and dots: matches '123@gmail.com' at the start.
z = z_tmp.search('123@gmail.com is my e-mail address')
print(f'z = {z}')

compile 옵션

DOTALL, S : 줄바꿈 문자를 포함하여 모든 문자와 매치

import re
# Without re.DOTALL the dot does not match a newline, so 'a\nb' does not match.
p = re.compile('a.b')
m = p.match('a\nb')
print(m)  # None

import re
# With re.DOTALL the dot also matches a newline, so 'a\nb' matches.
p = re.compile('a.b', re.DOTALL)
m = p.match('a\nb')
print(m)  # a Match object covering all three characters

IGNORECASE, I : 대소문자에 관계없이 매치

import re
# re.IGNORECASE lets [a-z] match uppercase letters as well.
p = re.compile('[a-z]*', re.IGNORECASE)
print(p.match('python'))  # matches 'python'
print(p.match('Python'))  # matches 'Python'
print(p.match('PYTHON'))  # matches 'PYTHON'

MULTILINE, M : 여러 줄과 매치 (^, $가 문자열 전체뿐 아니라 각 줄의 시작과 끝에도 매치)

import re
# Raw string: \s and \d are regex escapes, not Python string escapes. In a
# non-raw literal they are invalid escape sequences and trigger a warning.
p = re.compile(r'^python\s\d')

data = '''python 1
apple 2
python 3
cat 3
meat 4
'''

# Without re.MULTILINE, ^ matches only at the very start of the string,
# so only the first line is found: ['python 1'].
print(p.findall(data))

import re
# Raw string: \s and \d are regex escapes, not Python string escapes. In a
# non-raw literal they are invalid escape sequences and trigger a warning.
p = re.compile(r'^python\s\d', re.MULTILINE)  # option added

data = '''python 1
apple 2
python 3
cat 3
meat 4
'''

# With re.MULTILINE, ^ also matches right after each newline,
# so both 'python' lines are found: ['python 1', 'python 3'].
print(p.findall(data))

VERBOSE, X : verbose 모드를 사용 (주석을 사용하여 정규식을 보기 편하게 만들 수 있다.)

import re

# Compact form: the whole numeric-entity pattern on one line, with no explanation.
a = re.compile(r'&[#](0[0-7]+|[0-9]+|x[0-9a-fA-F]+);')

# Same pattern written with re.VERBOSE: whitespace in the pattern is ignored and
# '#' starts a comment, so each alternative can be annotated on its own line.
b = re.compile(r"""
&[#]               # Start of a numeric entity reference
(
0[0-7]+            # Octal form
| [0-9]+           # Decimal form
| x[0-9a-fA-F]+    # Hexadecimal form
)
;                  # Trailing semicolon
""", re.VERBOSE)
# Both patterns match the hexadecimal entity '&#x0acdf;' in full.
print(a.match(r'&#x0acdf;'))
print(b.match(r'&#x0acdf;'))

a와 b를 비교해보면, 결과는 동일하지만 b의 경우 주석을 사용하여 정규식을 설명해주기 때문에 가독성이 높다.

캡쳐

원하는 부분을 추출하고 싶을때 사용한다. 추출하고 싶은 부분을 ( ) 로 묶는다.

간단한 예제를 살펴보자

# test.py
import re
# Raw string: \w, \s and \d are regex escapes; in a non-raw literal they are
# invalid Python escape sequences and trigger a warning.
# Four words separated by whitespace, then three captured digit groups joined by '-'.
m = re.match(r'[\w]+\s[\w]+\s[\w]+\s[\w]+\s(\d\d\d)[-](\d\d\d\d)[-](\d\d\d\d)', 'My phone number is 010-1234-1234')
print(m.group())   # whole match: My phone number is 010-1234-1234
print(m.group(1))  # 010
print(m.group(2))  # 1234
print(m.group(3))  # 1234

참고 : https://regexper.com/

'python' 카테고리의 다른 글

[파이썬(Python)] #14. 자료형 (딕셔너리(dict)) (0)	2021.08.22
[파이썬(Python)] #10 모듈 생성 및 시작 (0)	2021.08.22
[파이썬(Python)] #24. with 구문 (0)	2021.08.22
[파이썬(Python)] #6. list comprehension (0)	2021.08.22
[파이썬(Python)] #22. for문 / enumerate 내장함수사용 (0)	2021.08.20