Skip to main content

Python 正则表达式

Python 正则表达式高级应用

本章节主要讲解 Python 正则表达式的高级用法:前瞻和后顾断言、反向引用、邮箱验证、提取网页链接、文本清理和格式化、日志分析、编译复用、选择合适的函数、转义问题,以及调试正则表达式。

1. 前瞻和后顾断言

import re

# Positive lookahead (?=...): match "Java" only when "Script" follows.
# The lookahead itself consumes nothing, so only "Java" is returned.
text = "Java JavaScript Python"
java_before_script = re.findall(r"Java(?=Script)", text)
print(java_before_script)  # ['Java'] (the prefix of "JavaScript")

# Negative lookahead (?!...): match "Java" only when "Script" does NOT follow.
java_not_before_script = re.findall(r"Java(?!Script)", text)
print(java_not_before_script)  # ['Java'] (the standalone "Java")

# Positive lookbehind (?<=...): match digits directly preceded by "¥".
prices = "¥100 $200 €300"
rmb_prices = re.findall(r"(?<=¥)\d+", prices)
print(rmb_prices)  # ['100']

# Negative lookbehind (?<!...): match digits NOT directly preceded by "¥".
# Pitfall: the lookbehind only inspects the single preceding character, so
# the engine rejects the "1" of "¥100" and then matches the remaining "00".
non_rmb_prices = re.findall(r"(?<!¥)\d+", prices)
print(non_rmb_prices)  # ['00', '200', '300']

# Also forbidding a preceding digit stops a match from starting in the middle
# of a number, which removes the stray "00".
non_rmb_amounts = re.findall(r"(?<![¥\d])\d+", prices)
print(non_rmb_amounts)  # ['200', '300']

2. 反向引用

import re

# Back-reference inside the pattern: \1 must repeat exactly what group 1
# captured, so only immediately doubled runs of word characters match.
repeated_run = re.compile(r"(\w+)\1")
text = "这是是一个测试测试文本"
duplicates = repeated_run.findall(text)
print(duplicates)  # ['是', '测试']

# Back-references inside the replacement: \1, \2 and \3 refer to the captured
# year, month and day, turning YYYY-MM-DD into DD/MM/YYYY.
iso_date = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
text = "2024-01-15"
new_date = iso_date.sub(r"\3/\2/\1", text)
print(new_date)  # 15/01/2024

3. 邮箱验证

import re

def validate_email(email):
    """Return True if *email* has the shape ``local@domain.tld``.

    This is a deliberately simple shape check, not a full RFC 5322
    validator: the local part may contain letters, digits and ``._%+-``,
    the domain letters, digits, dots and hyphens, and the final label must
    be at least two letters.

    Args:
        email: The string to check.

    Returns:
        True when the whole string matches the pattern, False otherwise.

    Raises:
        TypeError: If *email* is not a string.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    return re.fullmatch(pattern, email) is not None

# Sample inputs to run through validate_email.
emails = [
    "user@example.com",
    "test.email+tag@domain.co.uk",
    "invalid.email",
    "@domain.com",
    "user@"
]

# Print one verdict line per sample address.
for email in emails:
    if validate_email(email):
        result = "有效"
    else:
        result = "无效"
    print(f"{email}: {result}")

4. 提取网页链接

import re

html_content = '''
<a href="https://www.python.org">Python官网</a>
<a href="http://www.example.com">示例网站</a>
<a href="/local/path">本地链接</a>
'''

# Extract every href value, whether single- or double-quoted.
links = re.findall(r'href=["\']([^"\']*)["\']', html_content)
print("所有链接:", links)

# Extract only absolute HTTP/HTTPS links. The scheme is inside the capture
# group so that each result is a complete, usable URL.
http_links = re.findall(r'href=["\'](https?://[^"\']*)["\']', html_content)
print("HTTP链接:", http_links)

5. 文本清理和格式化

import re

def clean_text(text):
    """Strip HTML tags and normalize whitespace and ASCII punctuation spacing.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string: tags removed, every whitespace run collapsed to
        one space, each of ``, . ! ? ; :`` followed by exactly one space and
        preceded by none, and no leading or trailing whitespace. Only these
        ASCII punctuation marks are normalized; full-width punctuation is
        left as it is.
    """
    # Remove tags first: deleting a tag can leave the spaces on either side
    # of it adjacent, so whitespace has to be collapsed afterwards.
    text = re.sub(r'<[^>]+>', '', text)

    # Collapse every run of whitespace (including newlines) to one space.
    text = re.sub(r'\s+', ' ', text)

    # Normalize spacing around ASCII punctuation: none before, one after.
    text = re.sub(r'\s*([,.!?;:])\s*', r'\1 ', text)

    # The punctuation step can leave a trailing space; trim both ends.
    return text.strip()

messy_text = """
    这是    一个    <b>测试</b>    文本   。
    包含了   多余的空格    和   HTML标签  !
"""

cleaned = clean_text(messy_text)
print(f"清理后: {cleaned}")
# Output: 清理后: 这是 一个 测试 文本 。 包含了 多余的空格 和 HTML标签 !
# The full-width "。" and "!" are not among the ASCII punctuation marks that
# clean_text normalizes, so the single space in front of each one remains.

6. 日志分析

import re
from collections import Counter

log_content = '''
192.168.1.1 - - [10/Jan/2024:13:55:36] "GET /index.html HTTP/1.1" 200 2326
192.168.1.2 - - [10/Jan/2024:13:55:37] "POST /api/login HTTP/1.1" 401 23
192.168.1.1 - - [10/Jan/2024:13:55:38] "GET /dashboard HTTP/1.1" 200 4532
192.168.1.3 - - [10/Jan/2024:13:55:39] "GET /index.html HTTP/1.1" 404 162
'''

# Client IPs: the dotted quad at the start of each line. MULTILINE makes "^"
# match at the beginning of every line rather than only at the very start.
ip_pattern = r'^(\d+\.\d+\.\d+\.\d+)'
ips = re.compile(ip_pattern, re.MULTILINE).findall(log_content)
ip_counter = Counter()
ip_counter.update(ips)
print("IP访问统计:", dict(ip_counter))

# HTTP status codes: the three digits right after the closing quote of the
# request line.
status_pattern = r'" (\d{3}) '
statuses = [hit.group(1) for hit in re.finditer(status_pattern, log_content)]
status_counter = Counter()
status_counter.update(statuses)
print("状态码统计:", dict(status_counter))

# Requested URLs: the token between the upper-case method and "HTTP".
url_pattern = r'"[A-Z]+ ([^\s]+) HTTP'
urls = re.compile(url_pattern).findall(log_content)
url_counter = Counter()
url_counter.update(urls)
print("URL访问统计:", dict(url_counter))

7. 编译复用

import re
import time

# Benchmark data: many short inputs. What is being compared is per-call
# overhead, which a single huge input would drown out in pure matching time.
# The trailing space keeps the repeated addresses separate from each other.
text = "test@example.com " * 10
texts = [text] * 1000

# Method 1: pass the pattern string on every call. The re module caches
# compiled patterns, so this does not recompile each time, but every call
# still pays for the cache lookup.
start_time = time.perf_counter()
for sample in texts:
    re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', sample)
method1_time = time.perf_counter() - start_time

# Method 2: compile once and reuse the pattern object, skipping the lookup.
# The final class is [A-Za-z]; writing [A-Z|a-z] would also admit a literal
# "|", because "|" is not an alternation operator inside a character class.
pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
start_time = time.perf_counter()
for sample in texts:
    pattern.findall(sample)
method2_time = time.perf_counter() - start_time

# Guard the ratio against a zero denominator on very coarse clocks.
speedup = method1_time / method2_time if method2_time else float("inf")

print(f"未编译耗时: {method1_time:.4f}秒")
print(f"预编译耗时: {method2_time:.4f}秒")
print(f"性能提升: {speedup:.2f}倍")

8. 选择合适的函数

import re

text = "这里有很多数字:123, 456, 789, 101112"

# When only the first occurrence is needed, search() is enough; there is no
# reason to collect every match with findall().
first_match = re.search(r'\d+', text)
first_number = first_match.group()
print(f"第一个数字: {first_number}")

# To test whether the pattern occurs anywhere, use search() rather than
# match(), which only tries at the start of the string. A match object is
# always truthy, so bool() distinguishes it from None.
has_number = bool(re.search(r'\d+', text))
print(f"包含数字: {has_number}")

9. 转义问题

import re

# Sample text: a Windows path. In this normal (non-raw) literal each "\\" is
# one literal backslash.
text = "C:\\Users\\Python\\test.py"

# WRONG - shown as a comment because it does not even compile: in a normal
# string literal "\U" starts a \UXXXXXXXX Unicode escape, so Python raises
# SyntaxError (truncated \UXXXXXXXX escape) before the script can run.
#     wrong_pattern = "C:\Users"

# Right: a raw string, in which the regex escape "\\" (one literal backslash)
# is written exactly as the regex engine should see it.
correct_pattern = r"C:\\Users"
# Also right, but harder to read: escape once for the string literal and once
# more for the regex engine.
correct_pattern2 = "C:\\\\Users"

print(re.search(correct_pattern, text).group())   # C:\Users
print(re.search(correct_pattern2, text).group())  # C:\Users

10. 调试正则表达式

import re

def debug_regex(pattern, text):
    """Print a short diagnostic report for searching *text* with *pattern*.

    The report shows the pattern and the text, then either the first match
    with its start-end position (plus the captured groups, if the pattern
    has any) or a "no match" line, and ends with a 40-dash separator.

    Args:
        pattern: The regular expression to compile and search with.
        text: The string to search.

    Returns:
        None. The report is written to standard output.
    """
    # Compile before printing anything so an invalid pattern fails up front.
    regex = re.compile(pattern)
    print(f"模式: {pattern}")
    print(f"文本: {text}")

    match = regex.search(text)
    if match is None:
        print("未找到匹配")
    else:
        print(f"匹配成功: {match.group()}")
        print(f"位置: {match.start()}-{match.end()}")
        # groups() is an empty tuple when the pattern has no capture groups.
        captured = match.groups()
        if captured:
            print(f"分组: {match.groups()}")
    print("-" * 40)

# Example runs: one pattern without capture groups and one with two groups.
for sample_pattern, sample_text in (
    (r"\d{3}-\d{4}-\d{4}", "电话:138-1234-5678"),
    (r"(\w+)@(\w+\.\w+)", "邮箱:user@example.com"),
):
    debug_regex(sample_pattern, sample_text)