Selaa lähdekoodia

调整去md

1257
H Vs 2 viikkoa sitten
vanhempi
commit
b5cf40acf1
1 muutettua tiedostoa jossa 62 lisäystä ja 17 poistoa
  1. +62
    -17
      common/utils.py

+ 62
- 17
common/utils.py Näytä tiedosto

@@ -453,36 +453,81 @@ def check_chatroom(userName):
# return text

# NOTE(review): this looks like the pre-change implementation as rendered by a
# diff view: its removed lines are interleaved with the commented-out copy that
# replaced them, and the final `return` only survives as a comment. A later
# definition of remove_markdown_symbol in this listing supersedes this one.
# Confirm against the real common/utils.py before relying on anything here.
def remove_markdown_symbol(text: str):
# Strip heading markers (any run of '#' plus trailing whitespace, anywhere in the text)
text = re.sub(r'#+\s*', '', text)
# def remove_markdown_symbol(text: str):
# # Strip heading markers
# # text = re.sub(r'#+\s*', '', text)
# Strip bold and italic markers, keeping the wrapped text
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
text = re.sub(r'__([^_]+)__', r'\1', text)
text = re.sub(r'\*([^*]+)\*', r'\1', text)
text = re.sub(r'_([^_]+)_', r'\1', text)
# # Strip bold and italic markers
# text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
# text = re.sub(r'__([^_]+)__', r'\1', text)
# text = re.sub(r'\*([^*]+)\*', r'\1', text)
# text = re.sub(r'_([^_]+)_', r'\1', text)
# # Keep the link address
# text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
# # Keep the image address
# text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
# # Strip list markers
# text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)
# text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
# # Strip code blocks and inline code
# text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
# text = re.sub(r'`([^`]+)`', r'\1', text)
# Keep the link address: `[label](target)` collapses to `target`
text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
# # Strip blockquote markers
# text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
# Keep the image address: `![alt](src)` collapses to `src`.
# NOTE(review): the link rule above already rewrites the `[alt](src)` part of an
# image, so this pattern presumably never matches at this point -- confirm.
text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
# # Strip horizontal rules
# text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
# return text.strip()

def remove_markdown_symbol(text: str) -> str:
    """Strip Markdown markup from *text* and return the remaining plain text.

    Processing happens in a fixed order so that one rule cannot corrupt the
    input of another:

    1. URLs (image targets, link targets, bare links) are swapped for
       placeholders so underscores/asterisks inside them survive.
    2. Fenced code blocks are dropped entirely; single-line inline code is
       unwrapped and its content protected behind a placeholder as well.
    3. Block-level markers are removed: quote prefixes, headings, horizontal
       rules, bullet and numbered list markers.
    4. Inline emphasis markers (``**``, ``__``, ``*``, ``_``) are removed.
    5. Placeholders are expanded back to the protected fragments.

    Args:
        text: Markdown-formatted input.

    Returns:
        The text with Markdown syntax removed and surrounding whitespace
        stripped. Images and links are reduced to their URL.
    """
    protected = []

    def protect(fragment):
        # Remember the fragment and hand back the token that stands in for it.
        protected.append(fragment)
        return f"[[URL{len(protected) - 1}]]"

    # Images first (![alt](url)), otherwise the link rule would match the
    # "[alt](url)" part and leave a stray "!" behind.
    text = re.sub(r'!\[[^\]]*?\]\((https?://[^\s)]+)\)',
                  lambda m: protect(m.group(1)), text)
    # Links: [label](url) -> url
    text = re.sub(r'\[[^\]]*?\]\((https?://[^\s)]+)\)',
                  lambda m: protect(m.group(1)), text)
    # Bare links. A backtick is never part of a URL; excluding it keeps the
    # closing delimiter of `https://...` available to the inline-code rule.
    text = re.sub(r'https?://[^\s)`]+', lambda m: protect(m.group(0)), text)

    # Fenced code blocks are removed together with their content. The
    # non-greedy pattern stops at the first closing fence.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Inline code: drop the backticks but shield the content, so operators
    # such as "a * b * c" or names like "__init__" are not read as emphasis.
    text = re.sub(r'`([^`\n]+)`', lambda m: protect(m.group(1)), text)

    # Block-level markers go before inline emphasis; otherwise the "*" of two
    # bullet lines (or of a "***" rule) gets paired up as an italic span.
    # Quote prefixes go first so that "> - item" and "> # title" are cleaned too.
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Bold / italic. Underscore delimiters must not touch an ASCII letter or
    # digit on the outside, so identifiers like snake_case_name stay intact.
    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
    text = re.sub(r'(?<![A-Za-z0-9])__([^_]+)__(?![A-Za-z0-9])', r'\1', text)
    text = re.sub(r'\*([^\*]+)\*', r'\1', text)
    text = re.sub(r'(?<![A-Za-z0-9])_([^_]+)_(?![A-Za-z0-9])', r'\1', text)

    # Backtick pairs that span several lines were not protected above; just
    # unwrap them.
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Expand newest-first: a protected code span may itself contain an
    # (older, lower-numbered) URL placeholder that still has to be expanded.
    for index in range(len(protected) - 1, -1, -1):
        text = text.replace(f"[[URL{index}]]", protected[index])

    return text.strip()




Loading…
Peruuta
Tallenna