🎨 Tesseract OCR 语言包支持通过环境变量设置 Fix https://github.com/siyuan-note/siyuan/issues/7242

This commit is contained in:
Liang Ding 2023-02-03 18:36:33 +08:00
parent 3b0463029e
commit 09379324de
No known key found for this signature in database
GPG key ID: 136F30F901A2231D
4 changed files with 308 additions and 27 deletions

View file

@ -6,7 +6,7 @@
"id": "20200924100744-br924ar",
"title": "Assets",
"type": "doc",
"updated": "20230202231916"
"updated": "20230203183434"
},
"Children": [
{
@ -724,7 +724,7 @@
"ListData": {},
"Properties": {
"id": "20230202231731-bdh7lab",
"updated": "20230202231916"
"updated": "20230203183434"
},
"Children": [
{
@ -736,7 +736,7 @@
},
"Properties": {
"id": "20230202231732-n7z8jth",
"updated": "20230202231916"
"updated": "20230203183347"
},
"Children": [
{
@ -744,12 +744,21 @@
"Type": "NodeParagraph",
"Properties": {
"id": "20230202231732-f3jkj7p",
"updated": "20230202231916"
"updated": "20230203183347"
},
"Children": [
{
"Type": "NodeText",
"Data": "When installing Tesseract OCR, you need to check the language pack you need, and add the installed Tesseract-OCR directory path to the environment variable PATH, so that SiYuan can directly call the "
"Data": "When installing Tesseract OCR, you need to check the language pack you need, and add the installed Tesseract-OCR directory path to the environment variable "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "PATH"
},
{
"Type": "NodeText",
"Data": ", so that SiYuan can directly call the "
},
{
"Type": "NodeTextMark",
@ -782,7 +791,7 @@
},
"Properties": {
"id": "20230202231800-z8hswmk",
"updated": "20230202231819"
"updated": "20230203183434"
},
"Children": [
{
@ -790,12 +799,93 @@
"Type": "NodeParagraph",
"Properties": {
"id": "20230202231800-c3x45ky",
"updated": "20230202231819"
"updated": "20230203183434"
},
"Children": [
{
"Type": "NodeText",
"Data": "Do not install too many language packs, otherwise it will cause OCR to be slow or even timeout to return empty results, and take up too many system resources"
"Data": "SiYuan does not enable all installed language packs, because too many language packs will cause OCR to be slow or even time out and return empty results, and take up too many system resources, so by default only "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "eng"
},
{
"Type": "NodeText",
"Data": ", "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "chi*"
},
{
"Type": "NodeText",
"Data": ", "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "fra"
},
{
"Type": "NodeText",
"Data": ", "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "spa"
},
{
"Type": "NodeText",
"Data": ", "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "deu"
},
{
"Type": "NodeText",
"Data": ", "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "rus"
},
{
"Type": "NodeText",
"Data": " and "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "osd"
},
{
"Type": "NodeText",
"Data": " will be enabled at most. If you need more accurate language support, you can specify the language packs by configuring the environment variable "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "SIYUAN_TESSERACT_LANGS"
},
{
"Type": "NodeText",
"Data": ", such as "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng"
},
{
"Type": "NodeText",
"Data": ""
}
]
}

View file

@ -6,7 +6,7 @@
"id": "20200915214115-42b8zma",
"title": "资源文件",
"type": "doc",
"updated": "20230202231842"
"updated": "20230203182839"
},
"Children": [
{
@ -750,7 +750,7 @@
"ListData": {},
"Properties": {
"id": "20230202231309-pcjl7c2",
"updated": "20230202231842"
"updated": "20230203182839"
},
"Children": [
{
@ -770,12 +770,21 @@
"Type": "NodeParagraph",
"Properties": {
"id": "20230202231311-n1pf7in",
"updated": "20230202231842"
"updated": "20230203182342"
},
"Children": [
{
"Type": "NodeText",
"Data": "在安装 Tesseract OCR 时需要勾选你需要的语言包,并将安装后的 Tesseract-OCR 目录路径添加到环境变量 PATH 中,这样思源才能直接调用 "
"Data": "在安装 Tesseract OCR 时需要勾选你需要的语言包,并将安装后的 Tesseract-OCR 目录路径添加到环境变量 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "PATH"
},
{
"Type": "NodeText",
"Data": " 中,这样思源才能直接调用 "
},
{
"Type": "NodeTextMark",
@ -808,7 +817,7 @@
},
"Properties": {
"id": "20230202231321-q1b1tza",
"updated": "20230202231443"
"updated": "20230203182839"
},
"Children": [
{
@ -816,12 +825,93 @@
"Type": "NodeParagraph",
"Properties": {
"id": "20230202231321-5ugmgf0",
"updated": "20230202231443"
"updated": "20230203182839"
},
"Children": [
{
"Type": "NodeText",
"Data": "语言包不要安装太多,否则会导致 OCR 缓慢甚至超时返回空结果,并且占用过多的系统资源"
"Data": "思源并不会启用所有已安装的语言包,因为语言包太多的话会导致 OCR 缓慢甚至超时返回空结果,并且占用过多的系统资源,所以默认最多只会启用 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "eng"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "chi*"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "fra"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "spa"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "deu"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "rus"
},
{
"Type": "NodeText",
"Data": " 和 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "osd"
},
{
"Type": "NodeText",
"Data": "​,如果你需要更准确的语言支持,可以通过配置环境变量 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "SIYUAN_TESSERACT_LANGS"
},
{
"Type": "NodeText",
"Data": " 来指定语言包,比如 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng"
},
{
"Type": "NodeText",
"Data": ""
}
]
}

View file

@ -5,7 +5,7 @@
"Properties": {
"id": "20211226123038-4umgpxy",
"title": "資料文件",
"updated": "20230202231927"
"updated": "20230203183310"
},
"Children": [
{
@ -729,7 +729,7 @@
"ListData": {},
"Properties": {
"id": "20230202231516-o6k9mj1",
"updated": "20230202231927"
"updated": "20230203183310"
},
"Children": [
{
@ -741,7 +741,7 @@
},
"Properties": {
"id": "20230202231516-pwj2ndg",
"updated": "20230202231927"
"updated": "20230203183210"
},
"Children": [
{
@ -749,12 +749,21 @@
"Type": "NodeParagraph",
"Properties": {
"id": "20230202231516-8trf08t",
"updated": "20230202231927"
"updated": "20230203183210"
},
"Children": [
{
"Type": "NodeText",
"Data": "在安裝 Tesseract OCR 時需要勾選你需要的語言包,並將安裝後的 Tesseract-OCR 目錄路徑添加到環境變量 PATH 中,這樣思源才能直接調用 "
"Data": "在安裝 Tesseract OCR 時需要勾選你需要的語言包,並將安裝後的 Tesseract-OCR 目錄路徑添加到環境變量 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "PATH"
},
{
"Type": "NodeText",
"Data": " 中,這樣思源才能直接調用 "
},
{
"Type": "NodeTextMark",
@ -787,7 +796,7 @@
},
"Properties": {
"id": "20230202231519-x47s7he",
"updated": "20230202231811"
"updated": "20230203183310"
},
"Children": [
{
@ -795,12 +804,93 @@
"Type": "NodeParagraph",
"Properties": {
"id": "20230202231519-04f6dh6",
"updated": "20230202231811"
"updated": "20230203183310"
},
"Children": [
{
"Type": "NodeText",
"Data": "語言包不要安裝太多,否則會導致 OCR 緩慢甚至超時返回空結果,並且佔用過多的系統資源"
"Data": "思源並不會啟用所有已安裝的語言包,因為語言包太多的話會導致 OCR 緩慢甚至超時返回空結果,並且佔用過多的系統資源,所以默認最多只會啟用 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "eng"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "chi*"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "fra"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "spa"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "deu"
},
{
"Type": "NodeText",
"Data": "​、"
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "rus"
},
{
"Type": "NodeText",
"Data": " 和 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "osd"
},
{
"Type": "NodeText",
"Data": "​,如果你需要更準確的語言支持,可以通過配置環境變量 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "SIYUAN_TESSERACT_LANGS"
},
{
"Type": "NodeText",
"Data": " 來指定語言包,比如 "
},
{
"Type": "NodeTextMark",
"TextMarkType": "code",
"TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng"
},
{
"Type": "NodeText",
"Data": ""
}
]
}

View file

@ -125,10 +125,21 @@ func initTesseract() {
func filterTesseractLangs(langs []string) (ret []string) {
ret = []string{}
for _, lang := range langs {
if "eng" == lang || strings.HasPrefix(lang, "chi") || "fra" == lang || "spa" == lang || "deu" == lang ||
"rus" == lang || "osd" == lang {
ret = append(ret, lang)
envLangsVal := os.Getenv("SIYUAN_TESSERACT_LANGS")
if "" != envLangsVal {
envLangs := strings.Split(envLangsVal, "+")
for _, lang := range langs {
if gulu.Str.Contains(lang, envLangs) {
ret = append(ret, lang)
}
}
} else {
for _, lang := range langs {
if "eng" == lang || strings.HasPrefix(lang, "chi") || "fra" == lang || "spa" == lang || "deu" == lang ||
"rus" == lang || "osd" == lang {
ret = append(ret, lang)
}
}
}
return ret