我在一个正则表达式(在Python3.9中)遇到了麻烦,我已经挖掘了好几天,仍然被阻止.
下面是这样的表达方式:
^(?P<head>(?<=^).+?)??(?P<mid>(?:(?<=^)|[.])x+)?(?P<tail>[.][^/]+?)?(?<!^)$
这与以下内容匹配:
xx # works, goes in mid
a.xx # works, head "a", mid ".xx"
a.b.xx # FAILS, head "a", tails consumes the rest, why?
a.xx.c # works, head "a", mid ".xx", tail ".c"
a.b.xx.c # FAILS, head "a", tails consumes the rest, why?
- A.b.xx:我需要"A.B"作为头部,".xx"作为中间,没有 tail
- A.b.xx.c:我需要"A.B"作为头部,".xx"作为中间,".C"作为尾部
我不明白为什么在某些情况下 tail 会吃掉中间部分,而在某些情况下不会,你知道吗?你知道我怎么才能总是把".c"放在 tail 吗?非常感谢!
你能用语言描述一下你的正则表达式应该匹配什么吗?
这适用于特定于视觉效果的文件序列模式,例如:
- 描述.详细信息.[UDIM令牌或瓷砖编号].[帧编号].exr
- description.detail. [UDIM令牌或瓦片编号].exr
- Description.Detail.[帧编号].exr
- 描述.Detail.u<;udim>;.@.exr
- Description.Detail.u1001.@.exr
- Description.detail.u1001.1001.exr
- 描述.Detail.u<;udim>;.exr
- Description.detail.U1001.exr
- 描述.Detail.@.exr
- Description.detail.1001.exr
- U<;Udim&>.exr
- U1001.exr
- _
- 1001.exr
所以我们实际上有这样的群体:
- 可选标题,文件序列的一般描述
- 分隔符作为点,但这些可以包括在头部多次
- 作为标记或数字的UDIM部分,前面始终为"u"
- 作为填充令牌或数字的帧编号部分
- 尾部,这里是文件扩展名,但可以更多
- 我们需要使用UDIM、帧或两者进行检测,它们始终按此顺序排列
我把它简化到重现问题行为所需的最低限度,"MID"组可以是UDIM或框架.
更多详细信息,实际正则表达式:
import os
from aenum import LowerStrEnum, auto
class GroupName(LowerStrEnum):
"""Regex group name."""
FRAME_NUMBER = auto()
FRAME_SEPARATOR = auto()
FRAME_TOKEN = auto()
UDIM_SEPARATOR = auto()
UDIM_TILE = auto()
UDIM_TOKEN = auto()
def __str__(self) -> str:
return self.value
# Separator for pattern sections.
SEPARATOR = r"[_.]"
# Start or separator.
START_OR_SEPARATOR = rf"(?<=^)|{SEPARATOR}"
# UDIM tile.
UDIM_TILE = rf"(?P<{GroupName.UDIM_TILE}>[\d#@]{{4}})"
# UDIM or UDIM token.
UDIM_TOKEN = rf"(?P<{GroupName.UDIM_TOKEN}><[Uu][Dd][Ii][Mm]>)"
# Padding: '#', '###', '@', '@@', '%d', '%03d', '$F', '$F4'
PADDING = r"#+|@+|%(?:\d+)?d|\$F(?:\d+)?"
# Frame range: '1-2' '1,2' '1, 2' '1-2, 3' '1-2,3' '1,2-3'
FRAME_RANGE = r"-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?"
# Frame token (range/padding) regex.
FRAME_TOKEN = (
rf"(?P<{GroupName.FRAME_TOKEN}>"
r"(?:"
r"(?P<joined_embedded_frame_range>"
rf"(?:{FRAME_RANGE})"
r"|(?:-?\d+)"
r")"
rf"(?P<joined_padding>{PADDING})"
r")"
rf"|(?P<embedded_frame_range>{FRAME_RANGE})"
rf"|(?P<padding>{PADDING})"
r")"
)
# Base for all types of paths.
BASE = (
r"^"
# Head group.
r"(?:"
rf"(?P<directory>(?<=^).*{os.path.sep})?"
rf"(?P<head>[^{os.path.sep}]+?)"
r"(?!$)" # Stop the head from consuming everything next.
r")??"
# UDIM group.
rf"(?:"
rf"(?P<{GroupName.UDIM_SEPARATOR}>{START_OR_SEPARATOR})"
rf"[uU](?:{UDIM_TILE}|{UDIM_TOKEN})"
r")?"
# Frame sequence group.
r"(?:"
rf"(?P<{GroupName.FRAME_SEPARATOR}>{START_OR_SEPARATOR})"
r"(?:"
rf"{FRAME_TOKEN}"
rf"|(?P<{GroupName.FRAME_NUMBER}>-?\d+)"
r")"
r")?"
# Tail & ext group.
r"(?:"
rf"(?P<tail>{SEPARATOR}[^/]+)?"
rf"(?P<ext>{SEPARATOR}[a-zA-Z]+)"
r")?"
rf"(?: \[*(?P<extended_frame_range>{FRAME_RANGE})\]*?)?"
r"(?<!^)$"
)
^(?:(?P<directory>(?<=^).*/)?(?P<head>[^/]+?)(?!$))??(?:(?P<udim_separator>(?<=^)|[_.])[uU](?:(?P<udim_tile>[\d#@]{4})|(?P<udim_token><[Uu][Dd][Ii][Mm]>)))?(?:(?P<frame_separator>(?<=^)|[_.])(?:(?P<frame_token>(?:(?P<joined_embedded_frame_range>(?:-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?:-?\d+))(?P<joined_padding>#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<embedded_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)|(?P<padding>#+|@+|%(?:\d+)?d|\$F(?:\d+)?))|(?P<frame_number>-?\d+)))?(?:(?P<tail>[_.][^/]+)?(?P<ext>[_.][a-zA-Z]+))?(?: \[*(?P<extended_frame_range>-?\d+(?:[-,(?:, )]+-?\d+)+(x\d)?)\]*?)?(?<!^)$
打印以下字符串的分组字典:
tests/test_paths/udim_seq/head.1001
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': None,
'extended_frame_range': None,
'frame_number': '1001',
'frame_separator': '.',
'frame_token': None,
'head': 'head',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': None,
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
tests/test_paths/udim_seq/book.open.mid.u1001.1-2#
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': None,
'extended_frame_range': None,
'frame_number': None,
'frame_separator': '.',
'frame_token': '1-2#',
'head': 'book.open.mid',
'joined_embedded_frame_range': '1-2',
'joined_padding': '#',
'padding': None,
'tail': None,
'udim_separator': '.',
'udim_tile': '1001',
'udim_token': None}
tests/test_paths/udim_seq/head.1001.exr
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': '.exr',
'extended_frame_range': None,
'frame_number': '1001',
'frame_separator': '.',
'frame_token': None,
'head': 'head',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': None,
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
tests/test_paths/udim_seq/book.open.mid.1001.exr
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': '.exr',
'extended_frame_range': None,
'frame_number': None,
'frame_separator': None,
'frame_token': None,
'head': 'book',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': '.open.mid.1001',
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
tests/test_paths/udim_seq/book.open.mid.u1001.1-2#.exr
{'directory': 'tests/test_paths/udim_seq/',
'embedded_frame_range': None,
'ext': '.exr',
'extended_frame_range': None,
'frame_number': None,
'frame_separator': None,
'frame_token': None,
'head': 'book',
'joined_embedded_frame_range': None,
'joined_padding': None,
'padding': None,
'tail': '.open.mid.u1001.1-2#',
'udim_separator': None,
'udim_tile': None,
'udim_token': None}
一旦头部包括分隔符, tail 就会吃掉一切...