2 viikkoa sitten · b5cf40acf1
--- a/common/utils.py
+++ b/common/utils.py
@@ -453,36 +453,81 @@ def check_chatroom(userName):
    
 #     return text

 def remove_markdown_symbol(text: str):
  # 去除标题
    text = re.sub(r'#+\s*', '', text)
 # def remove_markdown_symbol(text: str):
 #   # 去除标题
 #     text = re.sub(r'#+\s*', '', text)
    
    # 去除粗体和斜体
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)
 #     # 去除粗体和斜体
 #     text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
 #     text = re.sub(r'__([^_]+)__', r'\1', text)
 #     text = re.sub(r'\*([^*]+)\*', r'\1', text)
 #     text = re.sub(r'_([^_]+)_', r'\1', text)
    
 #     # 保留链接地址
 #     text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
    
 #     # 保留图片地址
 #     text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
    
 #     # 去除列表
 #     text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)
 #     text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
    
 #     # 去除代码块和内联代码
 #     text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
 #     text = re.sub(r'`([^`]+)`', r'\1', text)
    
    # 保留链接地址
    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
 #     # 去除引用
 #     text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
    
    # 保留图片地址
    text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'\2', text)
 #     # 去除水平线
 #     text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    
 #     return text.strip()

 def remove_markdown_symbol(text: str) -> str:
    """Strip common Markdown markup from ``text`` and return plain text.

    URLs (image targets, link targets and bare ``http(s)`` links) are swapped
    for ``[[URLn]]`` placeholders before markup is stripped and are restored
    at the end, so ``_`` and ``*`` inside a URL are never treated as emphasis.
    For ``![alt](url)`` and ``[label](url)`` only the URL is kept.

    Processing order matters: fenced code blocks are dropped first (together
    with their content), then line-level markers (quotes, headings, list
    bullets, horizontal rules), and only then inline emphasis and inline
    code. Stripping ``*`` bullets before emphasis prevents two neighbouring
    bullets from being mistaken for one ``*...*`` italic span.

    Args:
        text: Markdown-formatted input.

    Returns:
        The text without the handled markup, with leading and trailing
        whitespace removed.
    """
    url_placeholders = []

    def url_replacer(url):
        # Remember the URL and hand back a token that contains no Markdown
        # emphasis characters.
        url_placeholders.append(url)
        return f"[[URL{len(url_placeholders)-1}]]"

    # Drop fenced code blocks first so markup inside a fence cannot pair up
    # with markup outside of it. The lazy match also copes with single
    # backticks inside the fence.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Markdown image syntax first: ![alt](url)
    text = re.sub(r'!\[[^\]]*?\]\((https?://[^\s)]+)\)', lambda m: url_replacer(m.group(1)), text)

    # Markdown link syntax: [text](url)
    text = re.sub(r'\[[^\]]*?\]\((https?://[^\s)]+)\)', lambda m: url_replacer(m.group(1)), text)

    # Remaining bare links
    text = re.sub(r'https?://[^\s)]+', lambda m: url_replacer(m.group(0)), text)

    # Quote markers go before the other line-level markers so that quoted
    # headings and quoted list items are cleaned as well.
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)

    # Heading markers
    text = re.sub(r'^\s*#+\s*', '', text, flags=re.MULTILINE)

    # List markers (bulleted and numbered). This must run before emphasis:
    # the italic pattern can span lines and would otherwise swallow "* "
    # bullets on adjacent lines.
    text = re.sub(r'^\s*[\*\+\-]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Horizontal rules
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Bold and italic (URLs are already protected by placeholders)
    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
    text = re.sub(r'__([^_]+)__', r'\1', text)
    text = re.sub(r'\*([^\*]+)\*', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)

    # Inline code: keep the content, drop the backticks
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Restore the protected URLs
    for i, url in enumerate(url_placeholders):
        text = text.replace(f"[[URL{i}]]", url)

    return text.strip()