diff --git a/kernel/api/archive.go b/kernel/api/archive.go
new file mode 100644
index 000000000..835262b3f
--- /dev/null
+++ b/kernel/api/archive.go
@@ -0,0 +1,81 @@
+// SiYuan - Refactor your thinking
+// Copyright (c) 2020-present, b3log.org
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package api
+
+import (
+	"net/http"
+	"path/filepath"
+
+	"github.com/88250/gulu"
+	"github.com/gin-gonic/gin"
+	"github.com/siyuan-note/siyuan/kernel/util"
+)
+
+func zip(c *gin.Context) {
+	ret := gulu.Ret.NewResult()
+	defer c.JSON(http.StatusOK, ret)
+
+	arg, ok := util.JsonArg(c, ret)
+	if !ok {
+		return
+	}
+
+	path := arg["path"].(string)
+	zipPath := arg["zipPath"].(string)
+	zipFile, err := gulu.Zip.Create(zipPath)
+	if nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+
+	base := filepath.Base(path)
+	if gulu.File.IsDir(path) {
+		err = zipFile.AddDirectory(base, path)
+	} else {
+		err = zipFile.AddEntry(base, path)
+	}
+	if nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+
+	if err = zipFile.Close(); nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+}
+
+func unzip(c *gin.Context) {
+	ret := gulu.Ret.NewResult()
+	defer c.JSON(http.StatusOK, ret)
+
+	arg, ok := util.JsonArg(c, ret)
+	if !ok {
+		return
+	}
+
+	zipPath := arg["zipPath"].(string)
+	path := arg["path"].(string)
+	if err := gulu.Zip.Unzip(zipPath, path); nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+}
diff --git a/kernel/api/router.go b/kernel/api/router.go
index f2cd93dc6..0d1707762 100644
--- a/kernel/api/router.go
+++ b/kernel/api/router.go
@@ -377,4 +377,7 @@ func ServeAPI(ginServer *gin.Engine) {
 	ginServer.Handle("GET", "/api/broadcast/channels", model.CheckAuth, getChannels)
 	ginServer.Handle("POST", "/api/broadcast/postMessage", model.CheckAuth, postMessage)
 	ginServer.Handle("POST", "/api/broadcast/getChannelInfo", model.CheckAuth, getChannelInfo)
+
+	ginServer.Handle("POST", "/api/archive/zip", model.CheckAuth, zip)
+	ginServer.Handle("POST", "/api/archive/unzip", model.CheckAuth, unzip)
 }
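
For reference, both new routes take a JSON body and reply with the kernel's usual {code, msg, data} envelope: code stays 0 on success and the handlers above set it to -1 with the error message on failure. A minimal client sketch, assuming the default kernel address 127.0.0.1:6806 and token auth via the Authorization header (both assumptions, not part of this patch):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// "path" is the file or directory to compress, "zipPath" is where the archive is written
	body, _ := json.Marshal(map[string]string{
		"path":    "/absolute/path/to/folder",
		"zipPath": "/absolute/path/to/out.zip",
	})
	req, _ := http.NewRequest("POST", "http://127.0.0.1:6806/api/archive/zip", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Token xxxxxx") // assumed API token
	resp, err := http.DefaultClient.Do(req)
	if nil != err {
		panic(err)
	}
	defer resp.Body.Close()
	var ret map[string]interface{}
	json.NewDecoder(resp.Body).Decode(&ret)
	fmt.Println(ret["code"], ret["msg"]) // expect 0 and an empty message on success
}

Calling /api/archive/unzip works the same way with the argument roles reversed: zipPath names the archive and path the extraction target.
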
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 9ee5f9a08..cdb8fd09a 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -21,15 +21,18 @@ import (
 	"io/fs"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"
 
 	"code.sajari.com/docconv"
 	"github.com/88250/gulu"
 	"github.com/88250/lute/ast"
 	"github.com/dustin/go-humanize"
+	"github.com/klippa-app/go-pdfium"
 	"github.com/klippa-app/go-pdfium/requests"
 	"github.com/klippa-app/go-pdfium/webassembly"
 	"github.com/siyuan-note/eventbus"
@@ -474,6 +477,7 @@ func NewAssetsSearcher() *AssetsSearcher {
 
 const (
 	TxtAssetContentMaxSize = 1024 * 1024 * 4
+	PDFAssetContentMaxPage = 1024
 )
 
 type AssetParseResult struct {
@@ -498,7 +502,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 	}
 
 	if TxtAssetContentMaxSize < info.Size() {
-		logging.LogWarnf("file [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
+		logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
 		return
 	}
 
@@ -514,6 +518,12 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 		return
 	}
 
+	if !utf8.Valid(data) {
+		// Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
+		logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
+		return
+	}
+
 	content := string(data)
 	ret = &AssetParseResult{
 		Content: content,
@@ -676,8 +686,68 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 type PdfAssetParser struct {
 }
 
-// Parse will parse a PDF document using PDFium webassembly module
+// pdfPage defines a worker job for text extraction
+type pdfPage struct {
+	pageNo int     // page number for text extraction
+	data   *[]byte // pointer to the shared PDF document data
+}
+
+// pdfTextResult defines the extracted PDF text result
+type pdfTextResult struct {
+	pageNo int    // page number of the PDF document
+	text   string // extracted text of the page
+	err    error  // processing error
+}
+
+// getTextPageWorker extracts the text from the given PDF pages and sends back one result per page
+func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
+	defer instance.Close()
+	for pd := range page {
+		doc, err := instance.OpenDocument(&requests.OpenDocument{
+			File: pd.data,
+		})
+		if nil != err {
+			// on a failed open there is no document handle to close
+			result <- &pdfTextResult{
+				pageNo: pd.pageNo,
+				err:    err,
+			}
+			continue
+		}
+
+		req := &requests.GetPageText{
+			Page: requests.Page{
+				ByIndex: &requests.PageByIndex{
+					Document: doc.Document,
+					Index:    pd.pageNo,
+				},
+			},
+		}
+		res, err := instance.GetPageText(req)
+		if nil != err {
+			instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+				Document: doc.Document,
+			})
+			result <- &pdfTextResult{
+				pageNo: pd.pageNo,
+				err:    err,
+			}
+			continue
+		}
+		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+			Document: doc.Document,
+		})
+		result <- &pdfTextResult{
+			pageNo: pd.pageNo,
+			text:   res.Text,
+			err:    nil,
+		}
+	}
+}
+
+// Parse will parse a PDF document using the PDFium webassembly module and a worker pool
 func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
+	now := time.Now()
 	if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
 		return
 	}
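
The worker above is the fan-out half of a fan-out/fan-in pipeline: Parse (next hunk) starts one worker per core, pushes one pdfPage job per page into the page channel, and collects the pdfTextResults on the result channel, restoring their order by pageNo. Each worker opens its own document handle from the shared in-memory PDF bytes rather than sharing a handle across goroutines. Stripped of PDFium, the same pattern looks like this (a purely illustrative sketch that squares numbers in place of extracting page text):

package main

import "fmt"

type job struct{ no int }

type result struct {
	no  int
	out int
}

func worker(jobs <-chan job, results chan<- result) {
	for j := range jobs {
		results <- result{no: j.no, out: j.no * j.no} // stand-in for per-page text extraction
	}
}

func main() {
	const n, workers = 8, 3
	jobs := make(chan job, n)       // buffered, so the producer never blocks
	results := make(chan result, n) // buffered, so workers never block
	for w := 0; w < workers; w++ {
		go worker(jobs, results)
	}
	for i := 0; i < n; i++ {
		jobs <- job{no: i}
	}
	close(jobs)
	// results arrive in arbitrary order; indexing a pre-sized slice by job
	// number restores order without sorting, exactly as Parse does with pageNo
	ordered := make([]int, n)
	for i := 0; i < n; i++ {
		r := <-results
		ordered[r.no] = r.out
	}
	fmt.Println(ordered) // [0 1 4 9 16 25 36 49]
}
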
@@ -692,75 +764,103 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 	}
 	defer os.RemoveAll(tmp)
 
-	f, err := os.Open(tmp)
-	if nil != err {
-		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
-		return
-	}
-	defer f.Close()
-
-	stat, err := f.Stat()
-	if nil != err {
-		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
-		return
-	}
+	// the PDF blob is processed in-memory so the document data can be shared across worker goroutines
+	pdfData, err := os.ReadFile(tmp)
+	if nil != err {
+		logging.LogErrorf("read [%s] failed: [%s]", tmp, err)
+		return
+	}
 
-	// initialize pdfium with one worker
+	// initialize go-pdfium with the number of available cores
+	// we fire up the complete worker pool for maximum performance
+	cores := runtime.NumCPU()
 	pool, err := webassembly.Init(webassembly.Config{
-		MinIdle:  1,
-		MaxIdle:  1,
-		MaxTotal: 1,
+		MinIdle:  cores,
+		MaxIdle:  cores,
+		MaxTotal: cores,
 	})
-	if err != nil {
+	if nil != err {
 		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 		return
 	}
 	defer pool.Close()
 
+	// first get the number of PDF pages to convert into text
 	instance, err := pool.GetInstance(time.Second * 30)
-	if err != nil {
+	if nil != err {
 		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 		return
 	}
-	defer instance.Close()
-
-	// get number of pages inside PDF document
 	doc, err := instance.OpenDocument(&requests.OpenDocument{
-		FileReader:     f,
-		FileReaderSize: stat.Size(),
+		File: &pdfData,
 	})
-	if err != nil {
+	if nil != err {
+		instance.Close()
 		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 		return
 	}
-	defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
-		Document: doc.Document,
-	})
+	pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
+	if nil != err {
+		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+			Document: doc.Document,
+		})
+		instance.Close()
+		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
+		return
+	}
+	instance.Close()
 
-	pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
-	if err != nil {
-		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
+	if PDFAssetContentMaxPage < pc.PageCount {
+		// PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
+		logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
 		return
 	}
-	// loop through pages and get content
-	content := ""
-	for page := 0; page < pageCount.PageCount; page++ {
-		req := &requests.GetPageText{
-			Page: requests.Page{
-				ByIndex: &requests.PageByIndex{
-					Document: doc.Document,
-					Index:    page,
-				},
-			},
-		}
-		pt, err := instance.GetPageText(req)
-		if err != nil {
+
+	// next set up the worker pool for processing PDF pages
+	pages := make(chan *pdfPage, pc.PageCount)
+	results := make(chan *pdfTextResult, pc.PageCount)
+	for i := 0; i < cores; i++ {
+		inst, err := pool.GetInstance(time.Second * 30)
+		if nil != err {
+			close(pages)
+			close(results)
 			logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 			return
 		}
-		content += " " + normalizeNonTxtAssetContent(pt.Text)
+		go parser.getTextPageWorker(i, inst, pages, results)
 	}
+
+	// now split the pages into jobs and let the worker pool process them
+	for p := 0; p < pc.PageCount; p++ {
+		pages <- &pdfPage{
+			pageNo: p,
+			data:   &pdfData,
+		}
+	}
+	close(pages)
+
+	// finally fetch the PDF page text results
+	// Note: some workers will process pages faster than others depending on the page contents,
+	// so the results arrive in arbitrary order and must be reordered using the pageNo index
+	pagetext := make([]string, pc.PageCount)
+	for p := 0; p < pc.PageCount; p++ {
+		res := <-results
+		pagetext[res.pageNo] = res.text
+		if nil != res.err {
+			logging.LogErrorf("convert page [%d] of [%s] failed: [%s]", res.pageNo, tmp, res.err)
+		}
+	}
+	close(results)
+
+	if 128 < pc.PageCount {
+		logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
+	}
+
+	// loop through the ordered PDF text pages and join the content for the asset parse result
+	content := ""
+	for _, pt := range pagetext {
+		content += " " + normalizeNonTxtAssetContent(pt)
+	}
 	ret = &AssetParseResult{
 		Content: content,
 	}
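
For a quick end-to-end check of the go-pdfium calls used above outside the kernel, a standalone sketch along these lines opens a document and reports its page count; sample.pdf is an assumed local fixture, and the single-instance pool mirrors the configuration this patch replaces:

package main

import (
	"fmt"
	"os"
	"time"

	"github.com/klippa-app/go-pdfium/requests"
	"github.com/klippa-app/go-pdfium/webassembly"
)

func main() {
	data, err := os.ReadFile("sample.pdf") // assumed test fixture
	if nil != err {
		panic(err)
	}
	// one idle instance is enough for a smoke test
	pool, err := webassembly.Init(webassembly.Config{MinIdle: 1, MaxIdle: 1, MaxTotal: 1})
	if nil != err {
		panic(err)
	}
	defer pool.Close()
	instance, err := pool.GetInstance(time.Second * 30)
	if nil != err {
		panic(err)
	}
	defer instance.Close()
	doc, err := instance.OpenDocument(&requests.OpenDocument{File: &data})
	if nil != err {
		panic(err)
	}
	defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{Document: doc.Document})
	pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
	if nil != err {
		panic(err)
	}
	fmt.Printf("sample.pdf has %d pages\n", pc.PageCount)
}
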