# How to Build Smarter Multilingual Text Wrapping with BudouX Through Parsing, HTML Rendering, Model Introspection, and Toy Training


import subprocess, sys
def pip(*pkgs):
   """Install the given packages with pip, using the current interpreter.

   Runs ``<sys.executable> -m pip install -q <pkgs...>`` and waits for it to
   finish. Calling it with no packages is a no-op.

   Args:
      *pkgs: Requirement specifiers passed verbatim to ``pip install``.

   Raises:
      subprocess.CalledProcessError: If pip exits with a non-zero status.
   """
   if not pkgs:
      # ``pip install`` with no requirements always exits with an error, so
      # skip the subprocess instead of spawning one that is certain to fail.
      return
   subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])
# Install BudouX up front so the `import budoux` further down can succeed.
pip("budoux")


import json, time, textwrap, html, random, re, os, tempfile
from pathlib import Path
import budoux
from IPython.display import HTML, display, Markdown


# Report the BudouX version, falling back to a generic marker when the
# module exposes no ``__version__`` attribute.
_budoux_version = getattr(budoux, "__version__", "installed")
print(f"✅ BudouX version: {_budoux_version}")


def header(title):
   """Show *title* as a level-2 Markdown heading through IPython's display."""
   heading = Markdown(f"## {title}")
   display(heading)


# Section 1: split one sample sentence per language with BudouX's default parsers.
header("1️⃣ Default parsers — Japanese / Chinese (Simplified & Traditional) / Thai")


# Each entry: (display label, sample sentence, loader for the matching default parser).
_sample_specs = [
   ("Japanese (ja)",
    "今日は天気です。BudouXは機械学習を用いた改行整形ツールです。",
    budoux.load_default_japanese_parser),
   ("Simplified Chinese",
    "今天是晴天。BudouX 是一个使用机器学习的换行整理工具。",
    budoux.load_default_simplified_chinese_parser),
   ("Traditional Chinese",
    "今天是晴天。BudouX 是一個使用機器學習的換行整理工具。",
    budoux.load_default_traditional_chinese_parser),
   ("Thai (th)",
    "วันนี้อากาศดีมากและฉันอยากออกไปเดินเล่นที่สวนสาธารณะ",
    budoux.load_default_thai_parser),
]

# label -> (sentence, loaded parser); the loaders are called here, in listing order.
samples = {
   label: (sentence, load_parser())
   for label, sentence, load_parser in _sample_specs
}

# For every sample, print the raw sentence, then its chunks joined by " | "
# together with the chunk count.
for name, (text, parser) in samples.items():
   chunks = parser.parse(text)
   print(
      f"\n• {name}",
      f"  raw   : {text}",
      f"  parsed: {' | '.join(chunks)}    ({len(chunks)} phrases)",
      sep="\n",
   )