以下の通りです
sudachipy.exe コマンドを呼ぶのではなく、https://github.com/WorksApplications/SudachiPy/blob/develop/sudachipy/dictionarylib/userdictionarybuilder.py にある UserDictionaryBuilder クラスを内部的に呼びたかったのですが、userdictionarybuilder.py は pip install sudachipy によるインストールの対象外のようでした。そのため、以下のように subprocess 経由で sudachipy.exe の ubuild コマンドを実行しています。
# Build a Sudachi user dictionary (user.dic) from a tab-separated word list.
#
# Pipeline: user.dic.src.txt (TSV) -> user.dic.csv -> `sudachipy ubuild` -> user.dic
import csv
import datetime
import os
import re
import subprocess
import sys
import unicodedata

try:
    # Only the sudachipy command-line tool is used below, never the module API.
    # The import is optional so the helper functions stay importable without
    # the package; main() still refuses to run when it is missing.
    import sudachipy
except ImportError:
    sudachipy = None

use_dic_dir = os.path.dirname(os.path.abspath(__file__))
dic_src_path = use_dic_dir + "/user.dic.src.txt"
dic_csv_path = use_dic_dir + "/user.dic.csv"
user_dic_path = use_dic_dir + "/user.dic"

sudachi_cmd = "C:/Users/end0t/miniconda3/Scripts/sudachipy.exe"
sys_dic_path = "C:/Users/end0t/miniconda3/Lib/site-packages/sudachidict_core/resources/system.dic"


def main():
    """Rebuild the user dictionary from the TSV source next to this script.

    Loads the TSV, writes the intermediate CSV, moves any existing user.dic
    aside as a dated backup, runs ``sudachipy ubuild`` and prints the path of
    the freshly built dictionary.
    """
    if sudachipy is None:
        # Fail before any file is written or renamed.
        sys.exit("ERROR sudachipy is not installed")

    dic_words = load_dic_src(dic_src_path)
    csv_path = save_dic_csv(dic_words)

    # Back up the previous user dictionary file.
    if os.path.exists(user_dic_path):
        now = datetime.datetime.now()
        backup_path = user_dic_path + "." + now.strftime('%Y%m%d')
        if os.path.exists(backup_path):
            # A backup from an earlier run today already exists. os.rename()
            # would raise on Windows here, so use a finer-grained timestamp
            # instead of failing or clobbering the earlier backup.
            backup_path = user_dic_path + "." + now.strftime('%Y%m%d%H%M%S')
        os.replace(user_dic_path, backup_path)

    print(make_user_dic(csv_path))


def make_user_dic(dic_csv_path):
    """Build the binary user dictionary with ``sudachipy ubuild``.

    Args:
        dic_csv_path: Path of the user-dictionary CSV to compile.

    Returns:
        The module-level ``user_dic_path`` the dictionary was written to.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. Its stderr is echoed to this process's stderr first.
        OSError: If the executable at ``sudachi_cmd`` cannot be started.
    """
    # An argument list with no shell keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = [sudachi_cmd, "ubuild", "-s", sys_dic_path, "-o", user_dic_path, dic_csv_path]
    print(" ".join(cmd))

    proc = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if proc.returncode != 0:
        # Surface the tool's diagnostics instead of silently returning a path
        # to a dictionary that was never built.
        sys.stderr.write(proc.stderr.decode(errors="replace"))
        raise subprocess.CalledProcessError(
            proc.returncode, cmd, output=proc.stdout, stderr=proc.stderr
        )
    return user_dic_path


# Creating the CSV for the user dictionary:
#   https://qiita.com/sakamoto_mi/items/c1787973dd1a591c9957
#   https://github.com/WorksApplications/Sudachi/blob/develop/docs/user_dict.md
def save_dic_csv(dic_words):
    """Write the user-dictionary CSV to the module-level ``dic_csv_path``.

    Every entry becomes one row registered as a general common noun
    (名詞,普通名詞,一般) with fixed connection ids 4789 and cost 5000. The
    caption goes into the 13th column, which is the normalized-form column
    according to Sudachi's user_dict.md.

    Args:
        dic_words: Mapping of word -> caption, both already normalized.
            Neither may contain a comma because rows are not quoted.

    Returns:
        The path of the CSV file that was written.
    """
    dic_csv_tmpl = \
        "{word},4789,4789,5000,{word},名詞,普通名詞,一般,*,*,*,*,{caption},*,*,*,*,*"

    with open(dic_csv_path, mode="w", encoding='utf-8') as f:
        f.writelines(
            dic_csv_tmpl.format(word=word, caption=caption) + "\n"
            for word, caption in dic_words.items()
        )
    return dic_csv_path


def load_dic_src(dic_src_path):
    """Load the TSV that the user-dictionary CSV is generated from.

    Each row holds a headword in the first column followed by its synonyms.
    Every word on a row, the headword included, is mapped to that row's
    headword.

    Args:
        dic_src_path: Path of the UTF-8 TSV file; a leading BOM is tolerated.

    Returns:
        Dict of normalized word -> normalized headword (caption), in file order.
    """
    ret_datas = {}

    # "utf-8-sig" strips a BOM if present so it cannot stick to the first headword.
    with open(dic_src_path, encoding='utf-8-sig') as f:
        for tsv_line in f:
            caption = None
            for i, word in enumerate(tsv_line.strip().split("\t")):
                word = normalize_word(word)
                if not word:
                    # Blank line or empty cell (e.g. two consecutive tabs);
                    # an empty surface form must never reach the CSV.
                    continue
                if word in ret_datas:
                    print(f"WARN duplicate word exist : {word}", file=sys.stderr)
                    continue
                if i == 0:
                    caption = word
                if not caption:
                    # The headword was empty or a duplicate, so the row has no
                    # caption and the rest of the row is skipped.
                    continue
                ret_datas[word] = caption
    return ret_datas


# Words registered in a Sudachi user dictionary need character normalization.
# https://zenn.dev/sorami/articles/6bdb4bf6c7f207
def normalize_word(word):
    """Normalize a word for registration in the user dictionary.

    Removes all whitespace, applies NFKC normalization, lowercases, and drops
    commas.

    Args:
        word: Raw cell text from the TSV source.

    Returns:
        The normalized word; may be empty.
    """
    # In str patterns \s already covers newlines and the full-width space.
    word = re.sub(r"\s+", "", word)
    word = unicodedata.normalize('NFKC', word)
    # Commas are removed after NFKC so full-width commas are caught as well;
    # a comma would break the unquoted CSV row.
    return word.lower().replace(",", "")


if __name__ == '__main__':
    main()