Python 正则表达式

Python 正则表达式高级应用

本章节主要讲解 Python 正则表达式的高级用法：前瞻和后顾断言、反向引用、邮箱验证、提取网页链接、文本清理和格式化、日志分析、编译复用、选择合适的函数、转义问题，以及调试正则表达式。

1. 前瞻和后顾断言

import re

# Positive lookahead (?=...): match "Java" only when "Script" follows.
# The lookahead itself consumes nothing, so only "Java" is returned.
text = "Java JavaScript Python"
java_before_script = re.findall(r"Java(?=Script)", text)
print(java_before_script)  # ['Java'] (the one inside "JavaScript")

# Negative lookahead (?!...): match "Java" only when "Script" does NOT follow.
java_not_before_script = re.findall(r"Java(?!Script)", text)
print(java_not_before_script)  # ['Java'] (the standalone "Java")

# Positive lookbehind (?<=...): digits directly preceded by "¥".
prices = "¥100 $200 €300"
rmb_prices = re.findall(r"(?<=¥)\d+", prices)
print(rmb_prices)  # ['100']

# Negative lookbehind (?<!...): digits NOT directly preceded by "¥".
# Edge case: the "1" of "¥100" is rejected, but the scan then restarts at the
# next character, where "00" is preceded by "1" (not "¥") and so matches.
non_rmb_prices = re.findall(r"(?<!¥)\d+", prices)
print(non_rmb_prices)  # ['00', '200', '300']

# To skip the ¥ amount entirely, also forbid a preceding digit so that a match
# can only start at the first digit of a number.
non_rmb_prices_strict = re.findall(r"(?<![¥\d])\d+", prices)
print(non_rmb_prices_strict)  # ['200', '300']

2. 反向引用

import re

# Find immediately repeated runs of word characters: group 1 must be followed
# by an identical copy of itself (\1). findall returns the group's text.
text = "这是是一个测试测试文本"
repeated_run = re.compile(r"(\w+)\1")
duplicates = repeated_run.findall(text)
print(duplicates)  # ['是', '测试']

# Back-references in the replacement string: reorder the three captured
# fields of YYYY-MM-DD into DD/MM/YYYY.
text = "2024-01-15"
iso_date = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
new_date = iso_date.sub(r"\3/\2/\1", text)
print(new_date)  # 15/01/2024

3. 邮箱验证

import re

def validate_email(email):
    """Return True if *email* looks like ``local@domain.tld``.

    The check is purely syntactic: a local part made of letters, digits and
    ``._%+-``, an ``@``, a domain made of letters, digits, dots and hyphens,
    and a final dot followed by at least two letters.

    Args:
        email: The string to check.

    Returns:
        True when the whole string matches the pattern, otherwise False.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" pass.
    return re.fullmatch(pattern, email) is not None

# Sample inputs: the first two are well-formed, the last three are not.
emails = [
    "user@example.com",
    "test.email+tag@domain.co.uk",
    "invalid.email",
    "@domain.com",
    "user@",
]

# Map the boolean verdict to the label that is printed.
labels = {True: "有效", False: "无效"}

for email in emails:
    result = labels[validate_email(email)]
    print(f"{email}: {result}")

4. 提取网页链接

import re

html_content = '''
<a href="https://www.python.org">Python官网</a>
<a href="http://www.example.com">示例网站</a>
<a href="/local/path">本地链接</a>
'''

# Every href value, absolute or relative: capture everything between the
# opening quote and the next quote character.
links = re.findall(r'href=["\']([^"\']*)["\']', html_content)
print("所有链接:", links)

# Absolute HTTP/HTTPS links only. The scheme is kept inside the capture group
# so each result is a complete URL and http stays distinguishable from https.
http_links = re.findall(r'href=["\'](https?://[^"\']*)["\']', html_content)
print("HTTP链接:", http_links)

5. 文本清理和格式化

import re

def clean_text(text):
    """Strip HTML tags and normalize whitespace and ASCII punctuation spacing.

    Steps, in order:
      1. remove anything of the form ``<...>``;
      2. collapse every run of whitespace (including newlines) to one space;
      3. remove whitespace before each of ``, . ! ? ; :`` and leave exactly
         one space after it;
      4. trim leading and trailing whitespace.

    Only the ASCII punctuation listed above is normalized; fullwidth
    punctuation such as ``。`` or ``！`` is left as it is.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Tags go first: deleting a tag that sat between two spaces leaves those
    # spaces adjacent, so whitespace has to be collapsed afterwards.
    text = re.sub(r'<[^>]+>', '', text)

    text = re.sub(r'\s+', ' ', text)

    text = re.sub(r'\s*([,.!?;:])\s*', r'\1 ', text)

    # The punctuation step can leave a trailing space; strip() removes it.
    return text.strip()

messy_text = """
    这是    一个    <b>测试</b>    文本   。
    包含了   多余的空格    和   HTML标签  ！
"""

cleaned = clean_text(messy_text)
print(f"清理后: {cleaned}")
# Prints: 清理后: 这是 一个 测试 文本 。 包含了 多余的空格 和 HTML标签 ！
# The fullwidth "。" and "！" are not in clean_text's ASCII punctuation class,
# so they keep the single space in front of them and are not converted.

6. 日志分析

import re
from collections import Counter

log_content = '''
192.168.1.1 - - [10/Jan/2024:13:55:36] "GET /index.html HTTP/1.1" 200 2326
192.168.1.2 - - [10/Jan/2024:13:55:37] "POST /api/login HTTP/1.1" 401 23
192.168.1.1 - - [10/Jan/2024:13:55:38] "GET /dashboard HTTP/1.1" 200 4532
192.168.1.3 - - [10/Jan/2024:13:55:39] "GET /index.html HTTP/1.1" 404 162
'''

# Dotted quad at the start of a line (re.MULTILINE lets ^ match at each line).
ip_pattern = r'^(\d+\.\d+\.\d+\.\d+)'
# Three digits between the request's closing quote and the following space.
status_pattern = r'" (\d{3}) '
# Non-whitespace token between an upper-case method name and "HTTP".
url_pattern = r'"[A-Z]+ ([^\s]+) HTTP'

# Requests per client IP.
ips = re.compile(ip_pattern, re.MULTILINE).findall(log_content)
ip_counter = Counter(ips)
print("IP访问统计:", dict(ip_counter))

# Responses per HTTP status code.
statuses = re.compile(status_pattern).findall(log_content)
status_counter = Counter(statuses)
print("状态码统计:", dict(status_counter))

# Requests per URL path.
urls = re.compile(url_pattern).findall(log_content)
url_counter = Counter(urls)
print("URL访问统计:", dict(url_counter))

7. 编译复用

import re
import time

# Many short inputs: the per-call overhead that pre-compiling removes is only
# visible when each individual scan is cheap.
text = "contact: test@example.com, admin@example.org"
texts = [text] * 20000

# The TLD class is [A-Za-z]; writing [A-Z|a-z] would also accept a literal "|".
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Method 1: module-level function with the pattern string (slower).
# The re module caches compiled patterns, so the pattern is not recompiled on
# every call, but each call still pays for the cache lookup.
start_time = time.perf_counter()
for sample in texts:
    re.findall(email_regex, sample)
method1_time = time.perf_counter() - start_time

# Method 2: compile once, then call the pattern object's method (faster).
pattern = re.compile(email_regex)
start_time = time.perf_counter()
for sample in texts:
    pattern.findall(sample)
method2_time = time.perf_counter() - start_time

# Guard the ratio so a zero elapsed time cannot raise ZeroDivisionError.
speedup = method1_time / method2_time if method2_time else float("inf")

print(f"未编译耗时: {method1_time:.4f}秒")
print(f"预编译耗时: {method2_time:.4f}秒")
print(f"性能提升: {speedup:.2f}倍")

8. 选择合适的函数

import re

text = "这里有很多数字：123, 456, 789, 101112"

# Only the first match is needed, so use search() rather than findall().
# search() returns None when nothing matches, so test the result before
# calling .group() instead of chaining the call directly.
first_match = re.search(r'\d+', text)
first_number = first_match.group() if first_match else None
print(f"第一个数字: {first_number}")

# To test whether the pattern occurs anywhere, use search() rather than
# match(): match() only succeeds when the pattern matches at position 0.
has_number = re.search(r'\d+', text) is not None
print(f"包含数字: {has_number}")

9. 转义问题

import re

# 错误：没有正确转义
# A Windows path; each "\\" in this literal is one real backslash.
text = "C:\\Users\\Python\\test.py"

# WRONG: in a normal (non-raw) string literal "\U" starts an 8-hex-digit
# Unicode escape, so the assignment below is rejected at compile time with
# "SyntaxError: (unicode error) ... truncated \UXXXXXXXX escape".
# It is kept as a comment so that the rest of the example can run:
# wrong_pattern = "C:\Users"

# Correct: a raw string. The regex engine receives \\ (an escaped backslash),
# which matches one literal backslash in the text.
correct_pattern = r"C:\\Users"
# Equivalent without a raw string: every backslash doubled once more.
correct_pattern2 = "C:\\\\Users"
# Or let re.escape() build the pattern from the literal text to search for.
correct_pattern3 = re.escape("C:\\Users")

print(re.search(correct_pattern, text).group())   # C:\Users
print(re.search(correct_pattern2, text).group())  # C:\Users
print(re.search(correct_pattern3, text).group())  # C:\Users

10. 调试正则表达式

import re

def debug_regex(pattern, text):
    """Print a short report on the first match of *pattern* in *text*.

    The report shows the pattern and the text, then either the matched
    substring, its start-end offsets and (when the pattern has capture
    groups) the tuple of groups, or a "no match" line. A 40-character dashed
    separator is printed last.

    Args:
        pattern: The regular expression to try.
        text: The string to search.
    """
    # Compile before printing anything so an invalid pattern fails up front.
    regex = re.compile(pattern)
    print(f"模式: {pattern}")
    print(f"文本: {text}")

    found = regex.search(text)
    if found is None:
        print("未找到匹配")
    else:
        print(f"匹配成功: {found.group()}")
        print(f"位置: {found.start()}-{found.end()}")
        captured = found.groups()
        # groups() is an empty tuple when the pattern has no capture groups.
        if captured:
            print(f"分组: {captured}")
    print("-" * 40)

# Demo: one pattern without capture groups and one with two capture groups.
for sample_pattern, sample_text in (
    (r"\d{3}-\d{4}-\d{4}", "电话：138-1234-5678"),
    (r"(\w+)@(\w+\.\w+)", "邮箱：user@example.com"),
):
    debug_regex(sample_pattern, sample_text)

Python 安装

Python 基础

Python 函数

Python 迭代器

Python 模块

Python 面向对象

Python 错误处理

Python 代码测试

Python 代码调试

Python IO 编程

Python 进程与线程

Python 正则表达式

Python 常用内置模块

Python 常用三方模块

Python 图形界面

Python 网络编程

Python 电子邮件

Python 数据库使用

Python Web开发

Python 异步 IO

Python 正则表达式高级应用

1. 前瞻和后顾断言

2. 反向引用

3. 邮箱验证

4. 提取网页链接

5. 文本清理和格式化

6. 日志分析

7. 编译复用

8. 选择合适的函数

9. 转义问题

10. 调试正则表达式

文章目录