Python 正则表达式

正则表达式（Regular Expression，简称 regex 或 regexp）是一种强大的文本模式匹配工具。它使用特殊的语法来描述和匹配字符串中的模式，广泛应用于文本搜索、替换、验证和提取等场景。在Python中，正则表达式通过内置的re模块来实现。

正则表达式看似复杂，实则很简单，如果大家在学习的过程中有疑惑的，可以使用作者开发的正则表达式可使用工具，它能实时可视化你写的正则是匹配到哪些字符了

黄金工具箱-正则表达式可视化

基础语法和元字符

普通字符

普通字符就是字面意思，直接匹配对应的字符。

import re

text = "Hello World"
pattern = "Hello"
result = re.search(pattern, text)
print(result.group())  # 输出: Hello

常用元字符

元字符	含义	示例
`.`	匹配除换行符外的任意字符	`a.c` 匹配 "abc", "a1c"
`^`	匹配字符串开头	`^Hello` 匹配以"Hello"开头的字符串
`$`	匹配字符串结尾	`end$` 匹配以"end"结尾的字符串
`*`	匹配前面字符0次或多次	`ab*` 匹配 "a", "ab", "abbb"
`+`	匹配前面字符1次或多次	`ab+` 匹配 "ab", "abbb"，不匹配 "a"
`?`	匹配前面字符0次或1次	`ab?` 匹配 "a", "ab"
`\`	转义字符	`\.` 匹配字面意思的点号

import re

# 点号匹配任意字符
text = "cat bat rat"
pattern = r".at"
matches = re.findall(pattern, text)
print(matches)  # 输出: ['cat', 'bat', 'rat']

# 开头和结尾匹配
text = "Hello World"
print(re.search(r"^Hello", text) is not None)  # True
print(re.search(r"World$", text) is not None)  # True

# 量词使用
text = "a ab abb abbb"
print(re.findall(r"ab*", text))   # ['a', 'ab', 'abb', 'abbb']
print(re.findall(r"ab+", text))   # ['ab', 'abb', 'abbb']
print(re.findall(r"ab?", text))   # ['a', 'ab', 'ab', 'ab']

字符类

import re

text = "Hello123 World456"

# \d 匹配数字
numbers = re.findall(r"\d", text)
print(numbers)  # ['1', '2', '3', '4', '5', '6']

# \w 匹配字母、数字、下划线
words = re.findall(r"\w+", text)
print(words)  # ['Hello123', 'World456']

# \s 匹配空白字符
spaces = re.findall(r"\s", text)
print(spaces)  # [' ']

# 自定义字符类
vowels = re.findall(r"[aeiou]", "Hello World")
print(vowels)  # ['e', 'o', 'o']

# 字符范围
letters = re.findall(r"[a-z]+", "Hello World 123")
print(letters)  # ['ello', 'orld']

Python中的re模块

导入和基本使用

import re

# 基本匹配
text = "Python是一门编程语言"
pattern = r"Python"
if re.search(pattern, text):
    print("找到匹配")
else:
    print("未找到匹配")

re模块的主要函数

re.search() - 查找第一个匹配

import re

text = "联系电话：138-1234-5678，备用电话：139-8765-4321"
pattern = r"\d{3}-\d{4}-\d{4}"
match = re.search(pattern, text)
if match:
    print(f"找到电话号码: {match.group()}")  # 138-1234-5678
    print(f"位置: {match.start()}-{match.end()}")  # 位置: 4-16

re.match() - 从字符串开头匹配

import re

text = "Python编程"
# match只从字符串开头匹配
result1 = re.match(r"Python", text)
print(result1.group() if result1 else "未匹配")  # Python

result2 = re.match(r"编程", text)
print(result2.group() if result2 else "未匹配")  # 未匹配

re.findall() - 查找所有匹配

import re

text = "今天是2024年1月15日，明天是2024年1月16日"
dates = re.findall(r"\d{4}年\d{1,2}月\d{1,2}日", text)
print(dates)  # ['2024年1月15日', '2024年1月16日']

# 返回元组（有分组时）
emails = "联系邮箱：user1@example.com，备用邮箱：user2@gmail.com"
email_parts = re.findall(r"(\w+)@(\w+\.\w+)", emails)
print(email_parts)  # [('user1', 'example.com'), ('user2', 'gmail.com')]

re.finditer() - 返回匹配对象的迭代器

import re

text = "价格：¥100，折扣价：¥80，VIP价：¥60"
pattern = r"¥(\d+)"
for match in re.finditer(pattern, text):
    print(f"价格: {match.group(1)}, 位置: {match.start()}-{match.end()}")
# 输出:
# 价格: 100, 位置: 2-6
# 价格: 80, 位置: 11-14
# 价格: 60, 位置: 20-23

re.sub() - 替换匹配的内容

import re

# 基本替换
text = "今天天气很好，明天天气也不错"
new_text = re.sub(r"天气", "心情", text)
print(new_text)  # 今天心情很好，明天心情也不错

# 使用函数进行替换
def price_converter(match):
    price = int(match.group(1))
    return f"${price * 0.15:.2f}"

text = "商品价格：¥100，特价：¥80"
converted = re.sub(r"¥(\d+)", price_converter, text)
print(converted)  # 商品价格：$15.00，特价：$12.00

re.split() - 按模式分割字符串

import re

# 基本分割
text = "苹果,香蕉;橙子|葡萄"
fruits = re.split(r"[,;|]", text)
print(fruits)  # ['苹果', '香蕉', '橙子', '葡萄']

# 保留分隔符
text = "第一章。第二章！第三章？"
chapters = re.split(r"([。！？])", text)
print(chapters)  # ['第一章', '。', '第二章', '！', '第三章', '？', '']

分组和捕获

基本分组

import re

# 使用圆括号创建分组
text = "我的生日是1990年5月20日"
pattern = r"(\d{4})年(\d{1,2})月(\d{1,2})日"
match = re.search(pattern, text)

if match:
    print(f"完整匹配: {match.group(0)}")  # 1990年5月20日
    print(f"年份: {match.group(1)}")      # 1990
    print(f"月份: {match.group(2)}")      # 5
    print(f"日期: {match.group(3)}")      # 20
    print(f"所有分组: {match.groups()}")   # ('1990', '5', '20')

命名分组

import re

text = "张三的邮箱是zhangsan@example.com"
pattern = r"(?P<name>\w+)的邮箱是(?P<email>\w+@\w+\.\w+)"
match = re.search(pattern, text)

if match:
    print(f"姓名: {match.group('name')}")    # 张三
    print(f"邮箱: {match.group('email')}")   # zhangsan@example.com
    print(f"字典形式: {match.groupdict()}")   # {'name': '张三', 'email': 'zhangsan@example.com'}

非捕获分组

import re

# (?:...) 表示非捕获分组，不会被保存
text = "http://www.example.com 和 https://www.google.com"
pattern = r"(?:http|https)://(\w+\.\w+\.\w+)"
matches = re.findall(pattern, text)
print(matches)  # ['www.example.com', 'www.google.com']

# 对比：使用捕获分组
pattern_with_capture = r"(http|https)://(\w+\.\w+\.\w+)"
matches_with_capture = re.findall(pattern_with_capture, text)
print(matches_with_capture)  # [('http', 'www.example.com'), ('https', 'www.google.com')]

贪婪匹配与非贪婪匹配

贪婪匹配（默认行为）

import re

text = '<div>内容1</div><div>内容2</div>'

# 贪婪匹配：尽可能多地匹配
greedy_pattern = r"<div>.*</div>"
greedy_match = re.search(greedy_pattern, text)
print(f"贪婪匹配: {greedy_match.group()}")
# 输出: <div>内容1</div><div>内容2</div>

非贪婪匹配

import re

text = '<div>内容1</div><div>内容2</div>'

# 非贪婪匹配：尽可能少地匹配
non_greedy_pattern = r"<div>.*?</div>"
non_greedy_matches = re.findall(non_greedy_pattern, text)
print(f"非贪婪匹配: {non_greedy_matches}")
# 输出: ['<div>内容1</div>', '<div>内容2</div>']

量词的贪婪与非贪婪形式

import re

text = "aaaa"

print("贪婪量词:")
print(re.findall(r"a+", text))    # ['aaaa']
print(re.findall(r"a*", text))    # ['aaaa', '']
print(re.findall(r"a{2,4}", text))  # ['aaaa']

print("\n非贪婪量词:")
print(re.findall(r"a+?", text))   # ['a', 'a', 'a', 'a']
print(re.findall(r"a*?", text))   # ['', '', '', '', '']
print(re.findall(r"a{2,4}?", text))  # ['aa']

编译正则表达式

为什么要编译

当需要多次使用同一个正则表达式时，预先编译可以提高性能。

import re

# 编译正则表达式
phone_pattern = re.compile(r"(\d{3})-(\d{4})-(\d{4})")

text1 = "我的电话是138-1234-5678"
text2 = "他的电话是139-8765-4321"

# 使用编译后的模式
match1 = phone_pattern.search(text1)
match2 = phone_pattern.search(text2)

print(f"电话1: {match1.group()}")  # 138-1234-5678
print(f"电话2: {match2.group()}")  # 139-8765-4321

编译标志

import re

text = "Hello WORLD"

# 忽略大小写
pattern = re.compile(r"hello", re.IGNORECASE)
print(pattern.search(text).group())  # Hello

# 多行模式
multiline_text = """第一行
第二行
第三行"""

# 使用MULTILINE标志
pattern1 = re.compile(r"^第二行", re.MULTILINE)
print(pattern1.search(multiline_text).group())  # 第二行

# 点号匹配所有字符（包括换行符）
pattern2 = re.compile(r"第一行.*第三行", re.DOTALL)
print(pattern2.search(multiline_text).group())  # 第一行\n第二行\n第三行

Python 安装

Python 基础

Python 函数

Python 迭代器

Python 模块

Python 面向对象

Python 错误处理

Python 代码测试

Python 代码调试

Python IO 编程

Python 进程与线程

Python 正则表达式

Python 常用内置模块

Python 常用三方模块

Python 图形界面

Python 网络编程

Python 电子邮件

Python 数据库使用

Python Web开发

Python 异步 IO

Python 正则表达式

基础语法和元字符

普通字符

常用元字符

字符类

Python中的re模块

导入和基本使用

re模块的主要函数

re.search() - 查找第一个匹配

re.match() - 从字符串开头匹配

re.findall() - 查找所有匹配

re.finditer() - 返回匹配对象的迭代器

re.sub() - 替换匹配的内容

re.split() - 按模式分割字符串

分组和捕获

基本分组

命名分组

非捕获分组

贪婪匹配与非贪婪匹配

贪婪匹配（默认行为）

非贪婪匹配

量词的贪婪与非贪婪形式

编译正则表达式

为什么要编译

编译标志

文章目录