yolov10/docs/update_translations.py

# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Script to fix broken Markdown links and front matter in language-specific directories zh, ko, ja, ru, de, fr, es, pt.

This script processes markdown files in language-specific directories (like /zh/). It finds Markdown links and checks
their existence. If a link is broken and does not exist in the language-specific directory but exists in the /en/
directory, the script updates the link to point to the corresponding file in the /en/ directory.

It also ensures that front matter keywords like 'comments:', 'description:', and 'keywords:' are not translated and
remain in English.
"""

import re
from pathlib import Path


class MarkdownLinkFixer:
    """
    Fix Markdown links, front matter and common markup issues in language-specific documentation directories.

    Attributes:
        base_dir (Path): Resolved docs root that contains the two-character language directories.
        update_links (bool): Whether broken `.md` links are redirected to their English counterpart.
        update_text (bool): Whether front matter, admonitions, iframes and HTML tags are normalized.
        md_link_regex (re.Pattern): Matches `[text](relative/path.md)` links (no scheme, no anchor).
    """

    # English front matter keys, index-aligned with every list in FRONT_MATTER_TRANSLATIONS
    FRONT_MATTER_KEYS = ("comments", "description", "keywords")
    FRONT_MATTER_TRANSLATIONS = {
        "zh": ["评论", "描述", "关键词"],  # Mandarin Chinese (Simplified) warning, sometimes translates as 关键字
        "es": ["comentarios", "descripción", "palabras clave"],  # Spanish
        "ru": ["комментарии", "описание", "ключевые слова"],  # Russian
        "pt": ["comentários", "descrição", "palavras-chave"],  # Portuguese
        "fr": ["commentaires", "description", "mots-clés"],  # French
        "de": ["kommentare", "beschreibung", "schlüsselwörter"],  # German
        "ja": ["コメント", "説明", "キーワード"],  # Japanese
        "ko": ["댓글", "설명", "키워드"],  # Korean
        "hi": ["टिप्पणियाँ", "विवरण", "कीवर्ड"],  # Hindi
        "ar": ["التعليقات", "الوصف", "الكلمات الرئيسية"],  # Arabic
    }

    # English admonition types, index-aligned with every list in ADMONITION_TRANSLATIONS
    ADMONITION_TYPES = (
        "Note",
        "Summary",
        "Tip",
        "Info",
        "Success",
        "Question",
        "Warning",
        "Failure",
        "Danger",
        "Bug",
        "Example",
        "Quote",
        "Abstract",
        "Seealso",
        "Admonition",
    )
    ADMONITION_TRANSLATIONS = {
        "en": ADMONITION_TYPES,
        "zh": [
            "笔记",
            "摘要",
            "提示",
            "信息",
            "成功",
            "问题",
            "警告",
            "失败",
            "危险",
            "故障",
            "示例",
            "引用",
            "摘要",
            "另见",
            "警告",
        ],
        "es": [
            "Nota",
            "Resumen",
            "Consejo",
            "Información",
            "Éxito",
            "Pregunta",
            "Advertencia",
            "Fracaso",
            "Peligro",
            "Error",
            "Ejemplo",
            "Cita",
            "Abstracto",
            "Véase También",
            "Amonestación",
        ],
        "ru": [
            "Заметка",
            "Сводка",
            "Совет",
            "Информация",
            "Успех",
            "Вопрос",
            "Предупреждение",
            "Неудача",
            "Опасность",
            "Ошибка",
            "Пример",
            "Цитата",
            "Абстракт",
            "См. Также",
            "Предостережение",
        ],
        "pt": [
            "Nota",
            "Resumo",
            "Dica",
            "Informação",
            "Sucesso",
            "Questão",
            "Aviso",
            "Falha",
            "Perigo",
            "Bug",
            "Exemplo",
            "Citação",
            "Abstrato",
            "Veja Também",
            "Advertência",
        ],
        "fr": [
            "Note",
            "Résumé",
            "Conseil",
            "Info",
            "Succès",
            "Question",
            "Avertissement",
            "Échec",
            "Danger",
            "Bug",
            "Exemple",
            "Citation",
            "Abstrait",
            "Voir Aussi",
            "Admonestation",
        ],
        "de": [
            "Hinweis",
            "Zusammenfassung",
            "Tipp",
            "Info",
            "Erfolg",
            "Frage",
            "Warnung",
            "Ausfall",
            "Gefahr",
            "Fehler",
            "Beispiel",
            "Zitat",
            "Abstrakt",
            "Siehe Auch",
            "Ermahnung",
        ],
        "ja": [
            "ノート",
            "要約",
            "ヒント",
            "情報",
            "成功",
            "質問",
            "警告",
            "失敗",
            "危険",
            "バグ",
            "例",
            "引用",
            "抄録",
            "参照",
            "訓告",
        ],
        "ko": [
            "노트",
            "요약",
            "팁",
            "정보",
            "성공",
            "질문",
            "경고",
            "실패",
            "위험",
            "버그",
            "예제",
            "인용",
            "추상",
            "참조",
            "경고",
        ],
        "hi": [
            "नोट",
            "सारांश",
            "सुझाव",
            "जानकारी",
            "सफलता",
            "प्रश्न",
            "चेतावनी",
            "विफलता",
            "खतरा",
            "बग",
            "उदाहरण",
            "उद्धरण",
            "सार",
            "देखें भी",
            "आगाही",
        ],
        "ar": [
            "ملاحظة",
            "ملخص",
            "نصيحة",
            "معلومات",
            "نجاح",
            "سؤال",
            "تحذير",
            "فشل",
            "خطر",
            "عطل",
            "مثال",
            "اقتباس",
            "ملخص",
            "انظر أيضاً",
            "تحذير",
        ],
    }

    # Canonical value written into every iframe 'allow' attribute
    IFRAME_ALLOW = "accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"

    # Leading '---' delimited block at the very start of a document; group 1 is the text between the delimiters
    _FRONT_MATTER_RE = re.compile(r"\A---[ \t]*\n(.*?)^---[ \t]*$", re.DOTALL | re.MULTILINE)

    def __init__(self, base_dir, update_links=True, update_text=True):
        """
        Initialize the MarkdownLinkFixer.

        Args:
            base_dir (str | Path): Docs root containing the language directories.
            update_links (bool): Redirect broken `.md` links to the English docs when True.
            update_text (bool): Normalize front matter, admonitions, iframes and HTML tags when True.
        """
        # Resolved so it is comparable with the resolved link targets computed in link_replacer()
        self.base_dir = Path(base_dir).resolve()
        self.update_links = update_links
        self.update_text = update_text
        self.md_link_regex = re.compile(r"\[([^]]+)]\(([^:)]+)\.md\)")

    @staticmethod
    def replace_front_matter(content, lang_dir):
        """
        Ensure front matter keywords remain in English.

        Only the leading '---' delimited block is touched, and only keys that start a line, so translated words that
        merely appear in the document body are left alone.

        Args:
            content (str): Full Markdown document.
            lang_dir (Path): Language directory; its stem selects the translation table.

        Returns:
            (str): Document with 'comments', 'description' and 'keywords' keys restored. 'comments' is always reset
                to 'true'.
        """
        terms = MarkdownLinkFixer.FRONT_MATTER_TRANSLATIONS.get(lang_dir.stem)
        if not terms:
            return content

        front = MarkdownLinkFixer._FRONT_MATTER_RE.match(content)
        if front is None:
            return content

        flags = re.IGNORECASE | re.MULTILINE
        block = front.group(1)
        for term, eng_key in zip(terms, MarkdownLinkFixer.FRONT_MATTER_KEYS):
            key = re.escape(term)
            if eng_key == "comments":
                # The value is a boolean that translators may also have translated; reset the whole line
                block = re.sub(rf"^{key} *[：:].*", f"{eng_key}: true", block, flags=flags)
            else:
                # Accept ASCII and full-width colons, keep the (translated) value
                block = re.sub(rf"^{key} *[：:] *", f"{eng_key}: ", block, flags=flags)

        return content[: front.start(1)] + block + content[front.end(1) :]

    @staticmethod
    def replace_admonitions(content, lang_dir):
        """
        Ensure admonition types ('!!! Note', '!!! Tip', ...) remain in English.

        For non-English languages an untitled admonition gets the translated word as its quoted title, e.g.
        '!!! 笔记' becomes '!!! Note "笔记"'. A titled admonition only has its type restored. An admonition that lost
        its type entirely ('!!! "Title"') becomes an 'Example'.

        Args:
            content (str): Full Markdown document.
            lang_dir (Path): Language directory; its stem selects the translation table.

        Returns:
            (str): Document with English admonition types. Unchanged when the language has no translation table.
        """
        lang = lang_dir.stem
        terms = MarkdownLinkFixer.ADMONITION_TRANSLATIONS.get(lang)
        if not terms:
            return content

        for term, eng_key in zip(terms, MarkdownLinkFixer.ADMONITION_TYPES):
            term_re = re.escape(term)  # terms such as 'См. Также' contain regex metacharacters
            if lang != "en":
                titled = f'!!! {eng_key} "{term}"\n'
                # Untitled admonition, whether the type was translated or not: add the translated title
                content = re.sub(rf"!!! *{eng_key} *\n", lambda _: titled, content, flags=re.IGNORECASE)
                content = re.sub(rf"!!! *{term_re} *\n", lambda _: titled, content, flags=re.IGNORECASE)
            # Titled admonition with a translated type: restore the English type only
            content = re.sub(rf"!!! *{term_re}", f"!!! {eng_key}", content, flags=re.IGNORECASE)

        # Admonitions without any type; independent of the loop above, so applied once
        return re.sub(r'!!! *"', '!!! Example "', content)

    @staticmethod
    def update_iframe(content):
        """
        Update the 'allow' attribute of iframe if it does not contain the specific English permissions.

        Args:
            content (str): Full Markdown document.

        Returns:
            (str): Document where every 'allow="..."' whose value does not start with the canonical permission list
                is replaced by it. Empty values are handled as well.
        """
        english = MarkdownLinkFixer.IFRAME_ALLOW
        # [^"]* keeps the match inside one attribute value, including an empty one
        pattern = re.compile(f'allow="(?!{re.escape(english)})[^"]*"')
        return pattern.sub(f'allow="{english}"', content)

    def link_replacer(self, match, parent_dir, lang_dir, use_abs_link=False):
        """
        Replace broken links with corresponding links in the /en/ directory.

        Args:
            match (re.Match): Match of `md_link_regex`; group 1 is the link text, group 2 the path without '.md'.
            parent_dir (Path): Directory of the Markdown file containing the link.
            lang_dir (Path): Language directory the file belongs to.
            use_abs_link (bool): Build docs-root based links instead of relative ones (known to be buggy).

        Returns:
            (str): The original link when its target exists or no English counterpart exists, otherwise the
                redirected link.
        """
        text, path = match.groups()
        # Append the suffix instead of with_suffix(), which would turn 'v8.1' into 'v8.md'
        linked_path = (parent_dir / f"{path}.md").resolve()

        if linked_path.exists():
            return match.group(0)

        en_linked_path = Path(str(linked_path).replace(str(lang_dir), str(lang_dir.parent / "en")))
        if not en_linked_path.exists():
            print(f"Warning: Broken link '[{text}]({path})' found in {parent_dir} does not exist in /docs/en/.")
            return match.group(0)

        if use_abs_link:
            # Use absolute links WARNING: BUGS, DO NOT USE
            docs_root_relative_path = en_linked_path.relative_to(lang_dir.parent)
            updated_path = str(docs_root_relative_path).replace("en/", "/../")
        else:
            # Use relative links: climb to the docs root, then descend into the English tree
            steps_up = len(parent_dir.relative_to(self.base_dir).parts)
            relative = Path("../" * steps_up) / en_linked_path.relative_to(self.base_dir)
            # Links always use '/', also on Windows; only the language component is dropped
            updated_path = relative.as_posix().replace("/en/", "/", 1)

        print(f"Redirecting link '[{text}]({path})' from {parent_dir} to {updated_path}")
        return f"[{text}]({updated_path})"

    @staticmethod
    def update_html_tags(content):
        """
        Update HTML tags in docs.

        Removes the closing slash of self-closing tags and gives Markdown images and HTML '<img>' tags without alt
        text the placeholder alt text 'MISSING'.

        Args:
            content (str): Full Markdown document.

        Returns:
            (str): Updated document.
        """
        alt_tag = "MISSING"

        # Remove closing slashes from self-closing HTML tags
        content = re.sub(r"<([^>]+?)\s*/>", r"<\1>", content)

        # Find all images without alt tags and add placeholder alt text
        content = re.sub(
            r"!\[(.*?)\]\((.*?)\)", lambda match: f"![{match.group(1) or alt_tag}]({match.group(2)})", content
        )

        # Add missing alt tags to HTML images; the lookahead is limited to the tag itself ([^>]*) so that text after
        # the tag is ignored and an 'alt' on a later line of a multi-line tag is still seen
        content = re.sub(
            r'<img\s+(?![^>]*\balt\b)[^>]*src=["\'](.*?)["\'][^>]*>',
            lambda match: f'{match.group(0)[:-1]} alt="{alt_tag}">',
            content,
        )

        return content

    def process_markdown_file(self, md_file_path, lang_dir):
        """
        Process each markdown file in the language directory.

        Args:
            md_file_path (Path): Markdown file to rewrite in place.
            lang_dir (Path): Language directory the file belongs to.
        """
        print(f"Processing file: {md_file_path}")
        with open(md_file_path, encoding="utf-8") as file:
            content = file.read()

        if self.update_links:
            content = self.md_link_regex.sub(lambda m: self.link_replacer(m, md_file_path.parent, lang_dir), content)

        if self.update_text:
            content = self.replace_front_matter(content, lang_dir)
            content = self.replace_admonitions(content, lang_dir)
            content = self.update_iframe(content)
            content = self.update_html_tags(content)

        with open(md_file_path, "w", encoding="utf-8") as file:
            file.write(content)

    def process_language_directory(self, lang_dir):
        """
        Process each language-specific directory.

        Args:
            lang_dir (Path): Language directory whose '*.md' files are processed recursively.
        """
        print(f"Processing language directory: {lang_dir}")
        for md_file in sorted(lang_dir.rglob("*.md")):
            self.process_markdown_file(md_file, lang_dir)

    def run(self):
        """Run the link fixing and front matter updating process for each language-specific directory."""
        # Sorted for a reproducible processing order; language directories have two-character names
        for subdir in sorted(self.base_dir.iterdir()):
            if subdir.is_dir() and re.fullmatch(r"\w\w", subdir.name):
                self.process_language_directory(subdir)


if __name__ == "__main__":
    # Set the path to your MkDocs 'docs' directory here
    docs_dir = str(Path(__file__).parent.resolve())
    fixer = MarkdownLinkFixer(docs_dir, update_links=True, update_text=True)
    fixer.run()