From 3dbc6d91edce6a7b04aa24574209af190ecf7b32 Mon Sep 17 00:00:00 2001 From: Liang Ding Date: Sat, 11 Feb 2023 10:33:53 +0800 Subject: [PATCH] =?UTF-8?q?:art:=20=E5=A4=A7=E4=BA=8E=202MB=20=E7=9A=84?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E9=BB=98=E8=AE=A4=E4=B8=8D=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=20OCR=20Fix=20https://github.com/siyuan-note/siyuan/issues/733?= =?UTF-8?q?3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../20210808180117-6v0mkxr/.siyuan/conf.json | 5 +- .../20200924100744-br924ar.sy | 87 +++++++++++++++++-- .../20210808180117-czj9bvb/.siyuan/conf.json | 5 +- .../20200915214115-42b8zma.sy | 86 ++++++++++++++++-- .../20211226090932-5lcq56f/.siyuan/conf.json | 5 +- .../20211226123038-4umgpxy.sy | 85 ++++++++++++++++-- kernel/util/tesseract.go | 16 +++- 7 files changed, 263 insertions(+), 26 deletions(-) diff --git a/app/guide/20210808180117-6v0mkxr/.siyuan/conf.json b/app/guide/20210808180117-6v0mkxr/.siyuan/conf.json index 3618180e8..a4d69e287 100644 --- a/app/guide/20210808180117-6v0mkxr/.siyuan/conf.json +++ b/app/guide/20210808180117-6v0mkxr/.siyuan/conf.json @@ -4,7 +4,8 @@ "icon": "1f4d4", "closed": false, "refCreateSavePath": "", - "createDocNameTemplate": "", + "docCreateSavePath": "", "dailyNoteSavePath": "/daily note/{{now | date \"2006/01\"}}/{{now | date \"2006-01-02\"}}", - "dailyNoteTemplatePath": "" + "dailyNoteTemplatePath": "", + "sortMode": 15 } \ No newline at end of file diff --git a/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy b/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy index 17a9fbf49..77b1a1927 100644 --- a/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy +++ b/app/guide/20210808180117-6v0mkxr/20200923234011-ieuun1p/20210808180303-xaduj2o/20200924100744-br924ar.sy @@ -6,7 +6,7 @@ "id": "20200924100744-br924ar", "title": "Assets", "type": "doc", - "updated": "20230203183434" + "updated": "20230211103249" }, "Children": [ { @@ -724,7 +724,7 @@ "ListData": {}, "Properties": { "id": "20230202231731-bdh7lab", - "updated": "20230203183434" + "updated": "20230211103249" }, "Children": [ { @@ -736,7 +736,7 @@ }, "Properties": { "id": "20230202231732-n7z8jth", - "updated": "20230203183347" + "updated": "20230211103249" }, "Children": [ { @@ -744,7 +744,7 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231732-f3jkj7p", - "updated": "20230203183347" + "updated": "20230211103249" }, "Children": [ { @@ -791,7 +791,7 @@ }, "Properties": { "id": "20230202231800-z8hswmk", - "updated": "20230203183434" + "updated": "20230211103154" }, "Children": [ { @@ -799,7 +799,7 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231800-c3x45ky", - "updated": "20230203183434" + "updated": "20230211103154" }, "Children": [ { @@ -881,7 +881,7 @@ { "Type": "NodeTextMark", "TextMarkType": "code", - "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng " + "TextMarkTextContent": "SIYUAN_TESSERACT_LANGS=chi_sim+eng" }, { "Type": "NodeText", @@ -890,6 +890,79 @@ ] } ] + }, + { + "ID": "20230211102830-9azqf9m", + "Type": "NodeListItem", + "ListData": { + "BulletChar": 42, + "Marker": "Kg==" + }, + "Properties": { + "id": "20230211102830-9azqf9m" + }, + "Children": [ + { + "ID": "20230211102830-sbchex4", + "Type": "NodeParagraph", + "Properties": { + "id": "20230211102830-sbchex4", + "updated": "20230211102832" + }, + "Children": [ + { + "Type": "NodeText", + "Data": "Only images in png and jpg formats are supported" + } + ] + } + ] + }, + { + "ID": "20230211102834-fx3o5su", + "Type": "NodeListItem", + "ListData": { + "BulletChar": 42, + "Marker": "Kg==" + }, + "Properties": { + "id": "20230211102834-fx3o5su", + "updated": "20230211102928" + }, + "Children": [ + { + "ID": "20230211102834-3jzjdrv", + "Type": "NodeParagraph", + "Properties": { + "id": "20230211102834-3jzjdrv", + "updated": "20230211102928" + }, + "Children": [ + { + "Type": "NodeText", + "Data": "By default, only images below 2MB are processed. If you need to adjust, you can set the environment variable " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_MAX_SIZE" + }, + { + "Type": "NodeText", + "Data": "​, the unit of value is bytes, for example: " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_MAX_SIZE=4000000" + }, + { + "Type": "NodeText", + "Data": "​ adjust the upper limit to 4MB" + } + ] + } + ] } ] }, diff --git a/app/guide/20210808180117-czj9bvb/.siyuan/conf.json b/app/guide/20210808180117-czj9bvb/.siyuan/conf.json index 22ec4786e..27a4c7ae5 100644 --- a/app/guide/20210808180117-czj9bvb/.siyuan/conf.json +++ b/app/guide/20210808180117-czj9bvb/.siyuan/conf.json @@ -4,7 +4,8 @@ "icon": "1f4d4", "closed": false, "refCreateSavePath": "", - "createDocNameTemplate": "", + "docCreateSavePath": "", "dailyNoteSavePath": "/daily note/{{now | date \"2006/01\"}}/{{now | date \"2006-01-02\"}}", - "dailyNoteTemplatePath": "" + "dailyNoteTemplatePath": "", + "sortMode": 15 } \ No newline at end of file diff --git a/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy b/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy index 63e7d0a2c..3a56e1123 100644 --- a/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy +++ b/app/guide/20210808180117-czj9bvb/20200812220555-lj3enxa/20210808180321-hbvl5c2/20200915214115-42b8zma.sy @@ -6,7 +6,7 @@ "id": "20200915214115-42b8zma", "title": "资源文件", "type": "doc", - "updated": "20230203182839" + "updated": "20230211103308" }, "Children": [ { @@ -750,7 +750,7 @@ "ListData": {}, "Properties": { "id": "20230202231309-pcjl7c2", - "updated": "20230203182839" + "updated": "20230211103308" }, "Children": [ { @@ -762,7 +762,7 @@ }, "Properties": { "id": "20230202231311-7qdk1za", - "updated": "20230202231842" + "updated": "20230211103308" }, "Children": [ { @@ -770,7 +770,7 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231311-n1pf7in", - "updated": "20230203182342" + "updated": "20230211103308" }, "Children": [ { @@ -817,7 +817,7 @@ }, "Properties": { "id": "20230202231321-q1b1tza", - "updated": "20230203182839" + "updated": "20230211103207" }, "Children": [ { @@ -825,7 +825,7 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231321-5ugmgf0", - "updated": "20230203182839" + "updated": "20230211103207" }, "Children": [ { @@ -916,6 +916,80 @@ ] } ] + }, + { + "ID": "20230211102440-0qik4dd", + "Type": "NodeListItem", + "ListData": { + "BulletChar": 42, + "Marker": "Kg==" + }, + "Properties": { + "id": "20230211102440-0qik4dd", + "updated": "20230211102642" + }, + "Children": [ + { + "ID": "20230211102440-09cmf75", + "Type": "NodeParagraph", + "Properties": { + "id": "20230211102440-09cmf75", + "updated": "20230211102642" + }, + "Children": [ + { + "Type": "NodeText", + "Data": "仅支持 png 和 jpg 格式的图片" + } + ] + } + ] + }, + { + "ID": "20230211102601-ifl3ojm", + "Type": "NodeListItem", + "ListData": { + "BulletChar": 42, + "Marker": "Kg==" + }, + "Properties": { + "id": "20230211102601-ifl3ojm", + "updated": "20230211102744" + }, + "Children": [ + { + "ID": "20230211102601-npe6hvh", + "Type": "NodeParagraph", + "Properties": { + "id": "20230211102601-npe6hvh", + "updated": "20230211102744" + }, + "Children": [ + { + "Type": "NodeText", + "Data": "默认只对 2MB 以下的图片进行处理,如果需要调整,可以通过环境变量 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_MAX_SIZE" + }, + { + "Type": "NodeText", + "Data": "​ 设置,值的单位是字节,比如:" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_MAX_SIZE=4000000" + }, + { + "Type": "NodeText", + "Data": "​ 将上限调整为 4MB" + } + ] + } + ] } ] }, diff --git a/app/guide/20211226090932-5lcq56f/.siyuan/conf.json b/app/guide/20211226090932-5lcq56f/.siyuan/conf.json index 8d20183a6..c3870b023 100644 --- a/app/guide/20211226090932-5lcq56f/.siyuan/conf.json +++ b/app/guide/20211226090932-5lcq56f/.siyuan/conf.json @@ -4,7 +4,8 @@ "icon": "1f4d4", "closed": false, "refCreateSavePath": "", - "createDocNameTemplate": "", + "docCreateSavePath": "", "dailyNoteSavePath": "/daily note/{{now | date \"2006/01\"}}/{{now | date \"2006-01-02\"}}", - "dailyNoteTemplatePath": "" + "dailyNoteTemplatePath": "", + "sortMode": 15 } \ No newline at end of file diff --git a/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy b/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy index 348afb926..6d64cf756 100644 --- a/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy +++ b/app/guide/20211226090932-5lcq56f/20211226115423-d5z1joq/20211226121203-rjjngpz/20211226123038-4umgpxy.sy @@ -5,7 +5,7 @@ "Properties": { "id": "20211226123038-4umgpxy", "title": "資料文件", - "updated": "20230203183310" + "updated": "20230211103259" }, "Children": [ { @@ -729,7 +729,7 @@ "ListData": {}, "Properties": { "id": "20230202231516-o6k9mj1", - "updated": "20230203183310" + "updated": "20230211103259" }, "Children": [ { @@ -741,7 +741,7 @@ }, "Properties": { "id": "20230202231516-pwj2ndg", - "updated": "20230203183210" + "updated": "20230211103259" }, "Children": [ { @@ -749,7 +749,7 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231516-8trf08t", - "updated": "20230203183210" + "updated": "20230211103259" }, "Children": [ { @@ -796,7 +796,7 @@ }, "Properties": { "id": "20230202231519-x47s7he", - "updated": "20230203183310" + "updated": "20230211103203" }, "Children": [ { @@ -804,7 +804,7 @@ "Type": "NodeParagraph", "Properties": { "id": "20230202231519-04f6dh6", - "updated": "20230203183310" + "updated": "20230211103203" }, "Children": [ { @@ -895,6 +895,79 @@ ] } ] + }, + { + "ID": "20230211102853-w8ykvqx", + "Type": "NodeListItem", + "ListData": { + "BulletChar": 42, + "Marker": "Kg==" + }, + "Properties": { + "id": "20230211102853-w8ykvqx" + }, + "Children": [ + { + "ID": "20230211102853-4lc4az0", + "Type": "NodeParagraph", + "Properties": { + "id": "20230211102853-4lc4az0", + "updated": "20230211102856" + }, + "Children": [ + { + "Type": "NodeText", + "Data": "僅支持 png 和 jpg 格式的圖片" + } + ] + } + ] + }, + { + "ID": "20230211102858-0lgz6pc", + "Type": "NodeListItem", + "ListData": { + "BulletChar": 42, + "Marker": "Kg==" + }, + "Properties": { + "id": "20230211102858-0lgz6pc", + "updated": "20230211102910" + }, + "Children": [ + { + "ID": "20230211102858-mfhvy7x", + "Type": "NodeParagraph", + "Properties": { + "id": "20230211102858-mfhvy7x", + "updated": "20230211102910" + }, + "Children": [ + { + "Type": "NodeText", + "Data": "默認只對 2MB 以下的圖片進行處理,如果需要調整,可以通過環境變量 " + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_MAX_SIZE" + }, + { + "Type": "NodeText", + "Data": "​ 設置,值的單位是字節,比如:" + }, + { + "Type": "NodeTextMark", + "TextMarkType": "code", + "TextMarkTextContent": "SIYUAN_TESSERACT_MAX_SIZE=4000000" + }, + { + "Type": "NodeText", + "Data": "​ 將上限調整為 4MB" + } + ] + } + ] } ] }, diff --git a/kernel/util/tesseract.go b/kernel/util/tesseract.go index 05969fb3b..abbe5d47c 100644 --- a/kernel/util/tesseract.go +++ b/kernel/util/tesseract.go @@ -23,17 +23,20 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "sync" "time" "github.com/88250/gulu" + "github.com/dustin/go-humanize" "github.com/siyuan-note/logging" ) var ( TesseractBin = "tesseract" TesseractEnabled bool + TesseractMaxSize = 2 * 1000 * uint64(1000) AssetsTexts = map[string]string{} AssetsTextsLock = sync.Mutex{} AssetsTextsChanged = false @@ -85,6 +88,10 @@ func Tesseract(imgAbsPath string) string { return "" } + if TesseractMaxSize < uint64(info.Size()) { + return "" + } + defer logging.Recover() ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second) @@ -124,8 +131,15 @@ func initTesseract() { return } + maxSizeVal := os.Getenv("SIYUAN_TESSERACT_MAX_SIZE") + if "" != maxSizeVal { + if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr { + TesseractMaxSize = maxSize + } + } + TesseractLangs = filterTesseractLangs(langs) - logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(TesseractLangs, "+")) + logging.LogInfof("tesseract-ocr enabled [ver=%s, maxSize=%s, langs=%s]", ver, humanize.Bytes(TesseractMaxSize), strings.Join(TesseractLangs, "+")) } func filterTesseractLangs(langs []string) (ret []string) {