From 0beca364fa60a348ab073e0d03e34841fd92a56d Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sat, 26 Aug 2023 22:38:02 +0800 Subject: [PATCH 1/9] :art: Add kernel API `/api/archive/zip` and `/api/archive/unzip` Fix https://github.com/siyuan-note/siyuan/issues/9028 --- kernel/api/archive.go | 81 +++++++++++++++++++++++++++++++++++++++++++ kernel/api/router.go | 3 ++ 2 files changed, 84 insertions(+) create mode 100644 kernel/api/archive.go diff --git a/kernel/api/archive.go b/kernel/api/archive.go new file mode 100644 index 000000000..835262b3f --- /dev/null +++ b/kernel/api/archive.go @@ -0,0 +1,81 @@ +// SiYuan - Refactor your thinking +// Copyright (c) 2020-present, b3log.org +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package api + +import ( + "net/http" + "path/filepath" + + "github.com/88250/gulu" + "github.com/gin-gonic/gin" + "github.com/siyuan-note/siyuan/kernel/util" +) + +func zip(c *gin.Context) { + ret := gulu.Ret.NewResult() + defer c.JSON(http.StatusOK, ret) + + arg, ok := util.JsonArg(c, ret) + if !ok { + return + } + + path := arg["path"].(string) + zipPath := arg["zipPath"].(string) + zipFile, err := gulu.Zip.Create(zipPath) + if nil != err { + ret.Code = -1 + ret.Msg = err.Error() + return + } + + base := filepath.Base(path) + if gulu.File.IsDir(path) { + err = zipFile.AddDirectory(base, path) + } else { + err = zipFile.AddEntry(base, path) + } + if nil != err { + ret.Code = -1 + ret.Msg = err.Error() + return + } + + if err = zipFile.Close(); nil != err { + ret.Code = -1 + ret.Msg = err.Error() + return + } +} + +func unzip(c *gin.Context) { + ret := gulu.Ret.NewResult() + defer c.JSON(http.StatusOK, ret) + + arg, ok := util.JsonArg(c, ret) + if !ok { + return + } + + zipPath := arg["zipPath"].(string) + path := arg["path"].(string) + if err := gulu.Zip.Unzip(zipPath, path); nil != err { + ret.Code = -1 + ret.Msg = err.Error() + return + } +} diff --git a/kernel/api/router.go b/kernel/api/router.go index f2cd93dc6..0d1707762 100644 --- a/kernel/api/router.go +++ b/kernel/api/router.go @@ -377,4 +377,7 @@ func ServeAPI(ginServer *gin.Engine) { ginServer.Handle("GET", "/api/broadcast/channels", model.CheckAuth, getChannels) ginServer.Handle("POST", "/api/broadcast/postMessage", model.CheckAuth, postMessage) ginServer.Handle("POST", "/api/broadcast/getChannelInfo", model.CheckAuth, getChannelInfo) + + ginServer.Handle("POST", "/api/archive/zip", model.CheckAuth, zip) + ginServer.Handle("POST", "/api/archive/unzip", model.CheckAuth, unzip) } From f4e840fae6c46cab8a03c89249f6d59913005750 Mon Sep 17 00:00:00 2001 From: nekrondev Date: Sat, 26 Aug 2023 16:44:14 +0200 Subject: [PATCH 2/9] feat(assets): improve PDF asset parser performance (#9051) This commit will change the single-threaded behavior of PDF parser into multi-threaded worker pool speeding up PDF parsing into text Co-authored-by: Heiko Besemann --- kernel/model/asset_content.go | 161 ++++++++++++++++++++++++++-------- 1 file changed, 122 insertions(+), 39 deletions(-) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 9ee5f9a08..2ba193b8e 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -21,6 +21,7 @@ import ( "io/fs" "os" "path/filepath" + "runtime" "strconv" "strings" "sync" @@ -30,6 +31,7 @@ import ( "github.com/88250/gulu" "github.com/88250/lute/ast" "github.com/dustin/go-humanize" + "github.com/klippa-app/go-pdfium" "github.com/klippa-app/go-pdfium/requests" "github.com/klippa-app/go-pdfium/webassembly" "github.com/siyuan-note/eventbus" @@ -676,8 +678,70 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) { type PdfAssetParser struct { } -// Parse will parse a PDF document using PDFium webassembly module +// pdfPage struct defines a worker job for text extraction +type pdfPage struct { + pageNo int // page number for text extraction + data *[]byte // pointer to PDF document data +} + +// pdfTextResult struct defines the extracted PDF text result +type pdfTextResult struct { + pageNo int // page number of PDF document + text string // text of converted page + err error // processing error +} + +// getTextPageWorker will extract the text from a given PDF page and return its result +func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) { + defer instance.Close() + for pd := range page { + doc, err := instance.OpenDocument(&requests.OpenDocument{ + File: pd.data, + }) + if err != nil { + instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ + Document: doc.Document, + }) + result <- &pdfTextResult{ + pageNo: pd.pageNo, + err: err, + } + continue + } + + req := &requests.GetPageText{ + Page: requests.Page{ + ByIndex: &requests.PageByIndex{ + Document: doc.Document, + Index: pd.pageNo, + }, + }, + } + res, err := instance.GetPageText(req) + if err != nil { + instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ + Document: doc.Document, + }) + result <- &pdfTextResult{ + pageNo: pd.pageNo, + err: err, + } + continue + } + instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ + Document: doc.Document, + }) + result <- &pdfTextResult{ + pageNo: pd.pageNo, + text: res.Text, + err: nil, + } + } +} + +// Parse will parse a PDF document using PDFium webassembly module using a worker pool func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { + st := time.Now() if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") { return } @@ -692,24 +756,20 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { } defer os.RemoveAll(tmp) - f, err := os.Open(tmp) - if nil != err { - logging.LogErrorf("open [%s] failed: [%s]", tmp, err) - return - } - defer f.Close() - - stat, err := f.Stat() + // PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible + pdfData, err := os.ReadFile(tmp) if nil != err { logging.LogErrorf("open [%s] failed: [%s]", tmp, err) return } - // initialize pdfium with one worker + // initialize go-pdfium with number of available cores + // we fire up the complete worker pool for maximum performance + cores := runtime.NumCPU() pool, err := webassembly.Init(webassembly.Config{ - MinIdle: 1, - MaxIdle: 1, - MaxTotal: 1, + MinIdle: cores, + MaxIdle: cores, + MaxTotal: cores, }) if err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) @@ -717,50 +777,73 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { } defer pool.Close() + // first get the number of PDF pages to convert into text instance, err := pool.GetInstance(time.Second * 30) if err != nil { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } - defer instance.Close() - - // get number of pages inside PDF document doc, err := instance.OpenDocument(&requests.OpenDocument{ - FileReader: f, - FileReaderSize: stat.Size(), + File: &pdfData, }) if err != nil { + instance.Close() logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } - defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ - Document: doc.Document, - }) + pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) + if err != nil { + instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ + Document: doc.Document, + }) + instance.Close() + logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) + return + } + instance.Close() - pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) - if err != nil { - logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) - return - } - // loop through pages and get content - content := "" - for page := 0; page < pageCount.PageCount; page++ { - req := &requests.GetPageText{ - Page: requests.Page{ - ByIndex: &requests.PageByIndex{ - Document: doc.Document, - Index: page, - }, - }, - } - pt, err := instance.GetPageText(req) + // next setup worker pool for processing PDF pages + pages := make(chan *pdfPage, pc.PageCount) + results := make(chan *pdfTextResult, pc.PageCount) + for i := 0; i < cores; i++ { + inst, err := pool.GetInstance(time.Second * 30) if err != nil { + close(pages) + close(results) logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } - content += " " + normalizeNonTxtAssetContent(pt.Text) + go parser.getTextPageWorker(i, inst, pages, results) } + // now split pages and let them process by worker pool + for p := 0; p < pc.PageCount; p++ { + pages <- &pdfPage{ + pageNo: p, + data: &pdfData, + } + } + close(pages) + + // finally fetch the PDF page text results + // Note: some workers will process pages faster than other workers depending on the page contents + // the order of returned PDF text pages is random and must be sorted using the pageNo index + pagetext := make([]string, pc.PageCount) + for p := 0; p < pc.PageCount; p++ { + res := <-results + pagetext[res.pageNo] = res.text + if res.err != nil { + logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err) + } + } + close(results) + logging.LogInfof("convert [%s] PDF with %d pages using %d workers took %s.\n", tmp, pc.PageCount, cores, time.Since(st)) + + // loop through ordered PDF text pages and join content for asset parse DB result + content := "" + for _, pt := range pagetext { + content += " " + normalizeNonTxtAssetContent(pt) + } ret = &AssetParseResult{ Content: content, } From 4baeeed1ecb7454e532fb5f4be72fca855b29585 Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 10:53:48 +0800 Subject: [PATCH 3/9] :art: Non-UTF-8 encoded text files are not included in asset file content searching Fix https://github.com/siyuan-note/siyuan/issues/9052 --- kernel/model/asset_content.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 2ba193b8e..c5ac3c7d9 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -26,6 +26,7 @@ import ( "strings" "sync" "time" + "unicode/utf8" "code.sajari.com/docconv" "github.com/88250/gulu" @@ -516,6 +517,11 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) { return } + if !utf8.Valid(data) { + // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052 + return + } + content := string(data) ret = &AssetParseResult{ Content: content, From fd32668abc29350046fa521df777795ff0431052 Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 11:02:36 +0800 Subject: [PATCH 4/9] :art: PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 --- kernel/model/asset_content.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index c5ac3c7d9..ff5c150ac 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -519,6 +519,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) { if !utf8.Valid(data) { // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052 + logging.LogWarnf("asset [%s] is not UTF-8 encoded", absPath) return } @@ -808,6 +809,12 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { } instance.Close() + if 1024 < pc.PageCount { + // PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 + logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount) + return + } + // next setup worker pool for processing PDF pages pages := make(chan *pdfPage, pc.PageCount) results := make(chan *pdfTextResult, pc.PageCount) From b578506ea5e0cc70093b78a077e612816217b632 Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 11:09:19 +0800 Subject: [PATCH 5/9] :art: PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 --- kernel/model/asset_content.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index ff5c150ac..426d94d53 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -748,7 +748,7 @@ func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, // Parse will parse a PDF document using PDFium webassembly module using a worker pool func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { - st := time.Now() + now := time.Now() if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") { return } @@ -850,7 +850,10 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { } } close(results) - logging.LogInfof("convert [%s] PDF with %d pages using %d workers took %s.\n", tmp, pc.PageCount, cores, time.Since(st)) + + if 256 < pc.PageCount { + logging.LogInfof("convert [%s] PDF with [%d[ pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now)) + } // loop through ordered PDF text pages and join content for asset parse DB result content := "" From 9cfcec231004defc8bd1e3608c5b206fc55a2674 Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 11:09:37 +0800 Subject: [PATCH 6/9] :art: PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 --- kernel/model/asset_content.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 426d94d53..374eec0c5 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -705,7 +705,7 @@ func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, doc, err := instance.OpenDocument(&requests.OpenDocument{ File: pd.data, }) - if err != nil { + if nil != err { instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) @@ -725,7 +725,7 @@ func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, }, } res, err := instance.GetPageText(req) - if err != nil { + if nil != err { instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) @@ -778,7 +778,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { MaxIdle: cores, MaxTotal: cores, }) - if err != nil { + if nil != err { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } @@ -786,20 +786,20 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { // first get the number of PDF pages to convert into text instance, err := pool.GetInstance(time.Second * 30) - if err != nil { + if nil != err { logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } doc, err := instance.OpenDocument(&requests.OpenDocument{ File: &pdfData, }) - if err != nil { + if nil != err { instance.Close() logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) return } pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) - if err != nil { + if nil != err { instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ Document: doc.Document, }) @@ -820,7 +820,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { results := make(chan *pdfTextResult, pc.PageCount) for i := 0; i < cores; i++ { inst, err := pool.GetInstance(time.Second * 30) - if err != nil { + if nil != err { close(pages) close(results) logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) @@ -845,7 +845,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { for p := 0; p < pc.PageCount; p++ { res := <-results pagetext[res.pageNo] = res.text - if res.err != nil { + if res.nil != err { logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err) } } From 184b4aa0746ef4ed8d411df5b2ddc08748225aa5 Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 11:12:11 +0800 Subject: [PATCH 7/9] :art: PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 --- kernel/model/asset_content.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 374eec0c5..34b80ca71 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -845,7 +845,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { for p := 0; p < pc.PageCount; p++ { res := <-results pagetext[res.pageNo] = res.text - if res.nil != err { + if nil != res.err { logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err) } } From 7d6c101813a7c0de2c59214cbf7b78a13461f224 Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 11:13:39 +0800 Subject: [PATCH 8/9] :art: PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 --- kernel/model/asset_content.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 34b80ca71..49ec49bdc 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -851,7 +851,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { } close(results) - if 256 < pc.PageCount { + if 128 < pc.PageCount { logging.LogInfof("convert [%s] PDF with [%d[ pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now)) } From d3fa2bc5470e299f0e3bf716615b0409ce0e578f Mon Sep 17 00:00:00 2001 From: Daniel <845765@qq.com> Date: Sun, 27 Aug 2023 11:17:53 +0800 Subject: [PATCH 9/9] :art: PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 --- kernel/model/asset_content.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 49ec49bdc..cdb8fd09a 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -477,6 +477,7 @@ func NewAssetsSearcher() *AssetsSearcher { const ( TxtAssetContentMaxSize = 1024 * 1024 * 4 + PDFAssetContentMaxPage = 1024 ) type AssetParseResult struct { @@ -501,7 +502,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) { } if TxtAssetContentMaxSize < info.Size() { - logging.LogWarnf("file [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size()))) + logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size()))) return } @@ -519,7 +520,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) { if !utf8.Valid(data) { // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052 - logging.LogWarnf("asset [%s] is not UTF-8 encoded", absPath) + logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath) return } @@ -809,7 +810,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { } instance.Close() - if 1024 < pc.PageCount { + if PDFAssetContentMaxPage < pc.PageCount { // PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053 logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount) return