From 09379324de63786ed332743dc93d17bc1fbbe3d2 Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Fri, 3 Feb 2023 18:36:33 +0800 Subject: [PATCH] =?UTF-8?q?:art:=20Tesseract=20OCR=20=E8=AF=AD=E8=A8=80?= =?UTF-8?q?=E5=8C=85=E6=94=AF=E6=8C=81=E9=80=9A=E8=BF=87=E7=8E=AF=E5=A2=83?= =?UTF-8?q?=E5=8F=98=E9=87=8F=E8=AE=BE=E7=BD=AE=20Fix=20https://github.com?= =?UTF-8?q?/siyuan-note/siyuan/issues/7242?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../20200924100744-br924ar.sy | 106 ++++++++++++++++-- .../20200915214115-42b8zma.sy | 104 +++++++++++++++-- .../20211226123038-4umgpxy.sy | 106 ++++++++++++++++-- kernel/util/tesseract.go | 19 +++- 4 files changed, 308 insertions(+), 27 deletions(-) diff --git a/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy b/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy index 0ff75eadc..17a9fbf49 100644 --- a/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy +++ b/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy @@ -6,7 +6,7 @@ "id": "20200924100744-br924ar", "title": "Assets", "type": "doc", - "updated": "20230202231916" + "updated": "20230203183434" }, "Children": [ { @@ -724,7 +724,7 @@ "ListData": {}, "Properties": { "id": "20230202231731-bdh7lab", - "updated": "20230202231916" + "updated": "20230203183434" }, "Children": [ { @@ -736,7 +736,7 @@ }, "Properties": { "id": "20230202231732-n7z8jth", - "updated": "20230202231916" + "updated": "20230203183347" }, "Children": [ { @@ -744,12 +744,21 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231732-f3jkj7p", - "updated": "20230202231916" + "updated": "20230203183347" }, "Children": [ { "Type": "NodeText", - "Data": "When installing Tesseract OCR, you need to check the language pack you need, and add the installed 
Tesseract-OCR directory path to the environment variable PATH, so that SiYuan can directly call the " + "Data": "When installing Tesseract OCR, you need to check the language pack you need, and add the installed Tesseract-OCR directory path to the environment variable " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "PATH" + }, + { + "Type": "NodeText", + "Data": "​, so that SiYuan can directly call the " }, { "Type": "NodeTextMark", @@ -782,7 +791,7 @@ }, "Properties": { "id": "20230202231800-z8hswmk", - "updated": "20230202231819" + "updated": "20230203183434" }, "Children": [ { @@ -790,12 +799,93 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231800-c3x45ky", - "updated": "20230202231819" + "updated": "20230203183434" }, "Children": [ { "Type": "NodeText", - "Data": "Do not install too many language packs, otherwise it will cause OCR to be slow or even timeout to return empty results, and take up too many system resources" + "Data": "SiYuan does not enable all installed language packs, because too many language packs will cause OCR to be slow or even time out and return empty results, and take up too many system resources, so by default only " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "eng" + }, + { + "Type": "NodeText", + "Data": "​, " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "chi*" + }, + { + "Type": "NodeText", + "Data": "​, " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "fra" + }, + { + "Type": "NodeText", + "Data": "​, " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "spa" + }, + { + "Type": "NodeText", + "Data": "​, " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "deu" + }, + { + "Type": "NodeText", + "Data": "​, " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": 
"rus" + }, + { + "Type": "NodeText", + "Data": "​ and " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "osd" + }, + { + "Type": "NodeText", + "Data": "​ will be enabled at most. If you need more accurate language support, you can specify the language packs by configuring the environment variable " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS" + }, + { + "Type": "NodeText", + "Data": "​, such as " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng" + }, + { + "Type": "NodeText", + "Data": "​" } ] } diff --git a/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy b/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy index 045e524f0..63e7d0a2c 100644 --- a/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy +++ b/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy @@ -6,7 +6,7 @@ "id": "20200915214115-42b8zma", "title": "资源文件", "type": "doc", - "updated": "20230202231842" + "updated": "20230203182839" }, "Children": [ { @@ -750,7 +750,7 @@ "ListData": {}, "Properties": { "id": "20230202231309-pcjl7c2", - "updated": "20230202231842" + "updated": "20230203182839" }, "Children": [ { @@ -770,12 +770,21 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231311-n1pf7in", - "updated": "20230202231842" + "updated": "20230203182342" }, "Children": [ { "Type": "NodeText", - "Data": "在安装 Tesseract OCR 时需要勾选你需要的语言包,并将安装后的 Tesseract-OCR 目录路径添加到环境变量 PATH 中,这样思源才能直接调用 " + "Data": "在安装 Tesseract OCR 时需要勾选你需要的语言包,并将安装后的 Tesseract-OCR 目录路径添加到环境变量 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "PATH" + }, + { + "Type": "NodeText", + "Data": "​ 中,这样思源才能直接调用 " 
}, { "Type": "NodeTextMark", @@ -808,7 +817,7 @@ }, "Properties": { "id": "20230202231321-q1b1tza", - "updated": "20230202231443" + "updated": "20230203182839" }, "Children": [ { @@ -816,12 +825,93 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231321-5ugmgf0", - "updated": "20230202231443" + "updated": "20230203182839" }, "Children": [ { "Type": "NodeText", - "Data": "语言包不要安装太多,否则会导致 OCR 缓慢甚至超时返回空结果,并且占用过多的系统资源" + "Data": "思源并不会启用所有已安装的语言包,因为语言包太多的话会导致 OCR 缓慢甚至超时返回空结果,并且占用过多的系统资源,所以默认最多只会启用 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "eng" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "chi*" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "fra" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "spa" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "deu" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "rus" + }, + { + "Type": "NodeText", + "Data": "​ 和 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "osd" + }, + { + "Type": "NodeText", + "Data": "​,如果你需要更准确的语言支持,可以通过配置环境变量 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS" + }, + { + "Type": "NodeText", + "Data": "​ 来指定语言包,比如 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng" + }, + { + "Type": "NodeText", + "Data": "​" } ] } diff --git a/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy 
b/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy index 4243b7b88..348afb926 100644 --- a/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy +++ b/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy @@ -5,7 +5,7 @@ "Properties": { "id": "20211226123038-4umgpxy", "title": "資料文件", - "updated": "20230202231927" + "updated": "20230203183310" }, "Children": [ { @@ -729,7 +729,7 @@ "ListData": {}, "Properties": { "id": "20230202231516-o6k9mj1", - "updated": "20230202231927" + "updated": "20230203183310" }, "Children": [ { @@ -741,7 +741,7 @@ }, "Properties": { "id": "20230202231516-pwj2ndg", - "updated": "20230202231927" + "updated": "20230203183210" }, "Children": [ { @@ -749,12 +749,21 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231516-8trf08t", - "updated": "20230202231927" + "updated": "20230203183210" }, "Children": [ { "Type": "NodeText", - "Data": "在安裝 Tesseract OCR 時需要勾選你需要的語言包,並將安裝後的 Tesseract-OCR 目錄路徑添加到環境變量 PATH 中,這樣思源才能直接調用 " + "Data": "在安裝 Tesseract OCR 時需要勾選你需要的語言包,並將安裝後的 Tesseract-OCR 目錄路徑添加到環境變量 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "PATH" + }, + { + "Type": "NodeText", + "Data": "​ 中,這樣思源才能直接調用 " }, { "Type": "NodeTextMark", @@ -787,7 +796,7 @@ }, "Properties": { "id": "20230202231519-x47s7he", - "updated": "20230202231811" + "updated": "20230203183310" }, "Children": [ { @@ -795,12 +804,93 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231519-04f6dh6", - "updated": "20230202231811" + "updated": "20230203183310" }, "Children": [ { "Type": "NodeText", - "Data": "語言包不要安裝太多,否則會導致 OCR 緩慢甚至超時返回空結果,並且佔用過多的系統資源" + "Data": "思源並不會啟用所有已安裝的語言包,因為語言包太多的話會導致 OCR 緩慢甚至超時返回空結果,並且佔用過多的系統資源,所以默認最多只會啟用 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "eng" + }, + { + "Type": "NodeText", + "Data": 
"​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "chi*" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "fra" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "spa" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "deu" + }, + { + "Type": "NodeText", + "Data": "​、" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "rus" + }, + { + "Type": "NodeText", + "Data": "​ 和 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "osd" + }, + { + "Type": "NodeText", + "Data": "​,如果你需要更準確的語言支持,可以通過配置環境變量 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS" + }, + { + "Type": "NodeText", + "Data": "​ 來指定語言包,比如 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng" + }, + { + "Type": "NodeText", + "Data": "​" } ] } diff --git a/kernel/util/tesseract.go b/kernel/util/tesseract.go index 32057f909..be6a1132b 100644 --- a/kernel/util/tesseract.go +++ b/kernel/util/tesseract.go @@ -125,10 +125,21 @@ func initTesseract() { func filterTesseractLangs(langs []string) (ret []string) { ret = []string{} - for _, lang := range langs { - if "eng" == lang || strings.HasPrefix(lang, "chi") || "fra" == lang || "spa" == lang || "deu" == lang || - "rus" == lang || "osd" == lang { - ret = append(ret, lang) + + envLangsVal := os.Getenv("SIYUAN_TESSERACT_LANGS") + if "" != envLangsVal { + envLangs := strings.Split(envLangsVal, "+") + for _, lang := range langs { + if gulu.Str.Contains(lang, envLangs) { + ret = append(ret, lang) + } + } + } else { + for _, lang := range langs { + if "eng" == lang || 
strings.HasPrefix(lang, "chi") || "fra" == lang || "spa" == lang || "deu" == lang || + "rus" == lang || "osd" == lang { + ret = append(ret, lang) + } } } return ret