From 0beca364fa60a348ab073e0d03e34841fd92a56d Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sat, 26 Aug 2023 22:38:02 +0800
Subject: [PATCH 1/9] :art: Add kernel API `/api/archive/zip` and
`/api/archive/unzip` Fix https://github.com/siyuan-note/siyuan/issues/9028
---
kernel/api/archive.go | 81 +++++++++++++++++++++++++++++++++++++++++++
kernel/api/router.go | 3 ++
2 files changed, 84 insertions(+)
create mode 100644 kernel/api/archive.go
diff --git a/kernel/api/archive.go b/kernel/api/archive.go
new file mode 100644
index 000000000..835262b3f
--- /dev/null
+++ b/kernel/api/archive.go
@@ -0,0 +1,81 @@
+// SiYuan - Refactor your thinking
+// Copyright (c) 2020-present, b3log.org
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package api
+
+import (
+ "net/http"
+ "path/filepath"
+
+ "github.com/88250/gulu"
+ "github.com/gin-gonic/gin"
+ "github.com/siyuan-note/siyuan/kernel/util"
+)
+
+// zip handles POST /api/archive/zip: it packs the file or directory at "path" into a new archive at "zipPath".
+func zip(c *gin.Context) {
+	ret := gulu.Ret.NewResult()
+	defer c.JSON(http.StatusOK, ret)
+
+	arg, ok := util.JsonArg(c, ret)
+	if !ok {
+		return
+	}
+
+	// Checked assertions: a missing or non-string field yields an error result instead of a panic.
+	path, pathOK := arg["path"].(string)
+	zipPath, zipPathOK := arg["zipPath"].(string)
+	if !pathOK || !zipPathOK {
+		ret.Code = -1
+		ret.Msg = "path and zipPath must be strings"
+		return
+	}
+	zipFile, err := gulu.Zip.Create(zipPath)
+	if nil == err {
+		base := filepath.Base(path)
+		if gulu.File.IsDir(path) {
+			err = zipFile.AddDirectory(base, path)
+		} else {
+			err = zipFile.AddEntry(base, path)
+		}
+		// Close even when adding failed so the archive is not left open; the first error wins.
+		if closeErr := zipFile.Close(); nil == err {
+			err = closeErr
+		}
+	}
+	if nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+	}
+}
+
+// unzip handles POST /api/archive/unzip: it extracts the archive at "zipPath" into "path".
+// A missing or non-string argument yields an error result instead of a panic.
+func unzip(c *gin.Context) {
+	ret := gulu.Ret.NewResult()
+	defer c.JSON(http.StatusOK, ret)
+	arg, ok := util.JsonArg(c, ret)
+	if !ok {
+		return
+	}
+	zipPath, zipPathOK := arg["zipPath"].(string)
+	path, pathOK := arg["path"].(string)
+	if !zipPathOK || !pathOK {
+		ret.Code, ret.Msg = -1, "zipPath and path must be strings"
+	} else if err := gulu.Zip.Unzip(zipPath, path); nil != err {
+		ret.Code, ret.Msg = -1, err.Error()
+	}
+}
diff --git a/kernel/api/router.go b/kernel/api/router.go
index f2cd93dc6..0d1707762 100644
--- a/kernel/api/router.go
+++ b/kernel/api/router.go
@@ -377,4 +377,7 @@ func ServeAPI(ginServer *gin.Engine) {
ginServer.Handle("GET", "/api/broadcast/channels", model.CheckAuth, getChannels)
ginServer.Handle("POST", "/api/broadcast/postMessage", model.CheckAuth, postMessage)
ginServer.Handle("POST", "/api/broadcast/getChannelInfo", model.CheckAuth, getChannelInfo)
+
+ ginServer.Handle("POST", "/api/archive/zip", model.CheckAuth, zip)
+ ginServer.Handle("POST", "/api/archive/unzip", model.CheckAuth, unzip)
}
From f4e840fae6c46cab8a03c89249f6d59913005750 Mon Sep 17 00:00:00 2001
From: nekrondev
Date: Sat, 26 Aug 2023 16:44:14 +0200
Subject: [PATCH 2/9] feat(assets): improve PDF asset parser performance
(#9051)
This commit replaces the single-threaded PDF parser with a
multi-threaded worker pool, which speeds up the extraction
of text from PDF files.
Co-authored-by: Heiko Besemann
---
kernel/model/asset_content.go | 161 ++++++++++++++++++++++++++--------
1 file changed, 122 insertions(+), 39 deletions(-)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 9ee5f9a08..2ba193b8e 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -21,6 +21,7 @@ import (
"io/fs"
"os"
"path/filepath"
+ "runtime"
"strconv"
"strings"
"sync"
@@ -30,6 +31,7 @@ import (
"github.com/88250/gulu"
"github.com/88250/lute/ast"
"github.com/dustin/go-humanize"
+ "github.com/klippa-app/go-pdfium"
"github.com/klippa-app/go-pdfium/requests"
"github.com/klippa-app/go-pdfium/webassembly"
"github.com/siyuan-note/eventbus"
@@ -676,8 +678,70 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
type PdfAssetParser struct {
}
-// Parse will parse a PDF document using PDFium webassembly module
+// pdfPage describes one job for a text extraction worker: a single page of a PDF document
+type pdfPage struct {
+ pageNo int // zero-based index of the page to extract
+ data *[]byte // pointer to the bytes of the whole PDF document, opened by the worker for each job
+}
+
+// pdfTextResult carries the outcome of one pdfPage job from a worker back to the caller
+type pdfTextResult struct {
+ pageNo int // index of the processed page, copied from the job so results can be put back in page order
+ text string // extracted text of the page; left empty when err is set
+ err error // error from opening the document or reading the page text, nil on success
+}
+
+// getTextPageWorker receives page jobs until the page channel is closed and sends exactly one
+// pdfTextResult per job to result, carrying either the extracted text or the error that occurred.
+// The given pdfium instance is closed when the worker returns; id is not used by the body.
+func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
+	defer instance.Close()
+	for pd := range page {
+		doc, err := instance.OpenDocument(&requests.OpenDocument{
+			File: pd.data,
+		})
+		if err != nil {
+			// no document was opened, so there is nothing to close; the response must
+			// not be dereferenced on error because it may be nil
+			result <- &pdfTextResult{
+				pageNo: pd.pageNo,
+				err:    err,
+			}
+			continue
+		}
+
+		req := &requests.GetPageText{
+			Page: requests.Page{
+				ByIndex: &requests.PageByIndex{
+					Document: doc.Document,
+					Index:    pd.pageNo,
+				},
+			},
+		}
+		res, err := instance.GetPageText(req)
+		if err != nil {
+			instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+				Document: doc.Document,
+			})
+			result <- &pdfTextResult{
+				pageNo: pd.pageNo,
+				err:    err,
+			}
+			continue
+		}
+		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+			Document: doc.Document,
+		})
+		result <- &pdfTextResult{
+			pageNo: pd.pageNo,
+			text:   res.Text,
+		}
+	}
+}
+
+// Parse will parse a PDF document using PDFium webassembly module using a worker pool
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
+ st := time.Now()
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
return
}
@@ -692,24 +756,20 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
defer os.RemoveAll(tmp)
- f, err := os.Open(tmp)
- if nil != err {
- logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
- return
- }
- defer f.Close()
-
- stat, err := f.Stat()
+ // PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible
+ pdfData, err := os.ReadFile(tmp)
if nil != err {
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
return
}
- // initialize pdfium with one worker
+ // initialize go-pdfium with number of available cores
+ // we fire up the complete worker pool for maximum performance
+ cores := runtime.NumCPU()
pool, err := webassembly.Init(webassembly.Config{
- MinIdle: 1,
- MaxIdle: 1,
- MaxTotal: 1,
+ MinIdle: cores,
+ MaxIdle: cores,
+ MaxTotal: cores,
})
if err != nil {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
@@ -717,50 +777,73 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
defer pool.Close()
+ // first get the number of PDF pages to convert into text
instance, err := pool.GetInstance(time.Second * 30)
if err != nil {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
- defer instance.Close()
-
- // get number of pages inside PDF document
doc, err := instance.OpenDocument(&requests.OpenDocument{
- FileReader: f,
- FileReaderSize: stat.Size(),
+ File: &pdfData,
})
if err != nil {
+ instance.Close()
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
- defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
- Document: doc.Document,
- })
+ pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
+ if err != nil {
+ instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+ Document: doc.Document,
+ })
+ instance.Close()
+ logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
+ return
+ }
+ instance.Close()
- pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
- if err != nil {
- logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
- return
- }
- // loop through pages and get content
- content := ""
- for page := 0; page < pageCount.PageCount; page++ {
- req := &requests.GetPageText{
- Page: requests.Page{
- ByIndex: &requests.PageByIndex{
- Document: doc.Document,
- Index: page,
- },
- },
- }
- pt, err := instance.GetPageText(req)
+ // next setup worker pool for processing PDF pages
+ pages := make(chan *pdfPage, pc.PageCount)
+ results := make(chan *pdfTextResult, pc.PageCount)
+ for i := 0; i < cores; i++ {
+ inst, err := pool.GetInstance(time.Second * 30)
if err != nil {
+ close(pages)
+ close(results)
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
- content += " " + normalizeNonTxtAssetContent(pt.Text)
+ go parser.getTextPageWorker(i, inst, pages, results)
}
+ // now split pages and let them process by worker pool
+ for p := 0; p < pc.PageCount; p++ {
+ pages <- &pdfPage{
+ pageNo: p,
+ data: &pdfData,
+ }
+ }
+ close(pages)
+
+ // finally fetch the PDF page text results
+ // Note: some workers will process pages faster than other workers depending on the page contents
+ // the order of returned PDF text pages is random and must be sorted using the pageNo index
+ pagetext := make([]string, pc.PageCount)
+ for p := 0; p < pc.PageCount; p++ {
+ res := <-results
+ pagetext[res.pageNo] = res.text
+ if res.err != nil {
+ logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err)
+ }
+ }
+ close(results)
+ logging.LogInfof("convert [%s] PDF with %d pages using %d workers took %s.\n", tmp, pc.PageCount, cores, time.Since(st))
+
+ // loop through ordered PDF text pages and join content for asset parse DB result
+ content := ""
+ for _, pt := range pagetext {
+ content += " " + normalizeNonTxtAssetContent(pt)
+ }
ret = &AssetParseResult{
Content: content,
}
From 4baeeed1ecb7454e532fb5f4be72fca855b29585 Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 10:53:48 +0800
Subject: [PATCH 3/9] :art: Non-UTF-8 encoded text files are not included in
asset file content searching Fix
https://github.com/siyuan-note/siyuan/issues/9052
---
kernel/model/asset_content.go | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 2ba193b8e..c5ac3c7d9 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -26,6 +26,7 @@ import (
"strings"
"sync"
"time"
+ "unicode/utf8"
"code.sajari.com/docconv"
"github.com/88250/gulu"
@@ -516,6 +517,11 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
return
}
+ if !utf8.Valid(data) {
+ // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
+ return
+ }
+
content := string(data)
ret = &AssetParseResult{
Content: content,
From fd32668abc29350046fa521df777795ff0431052 Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 11:02:36 +0800
Subject: [PATCH 4/9] :art: PDF files longer than 1024 pages are not included
in asset file content searching
https://github.com/siyuan-note/siyuan/issues/9053
---
kernel/model/asset_content.go | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index c5ac3c7d9..ff5c150ac 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -519,6 +519,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
if !utf8.Valid(data) {
// Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
+ logging.LogWarnf("asset [%s] is not UTF-8 encoded", absPath)
return
}
@@ -808,6 +809,12 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
instance.Close()
+ if 1024 < pc.PageCount {
+ // PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
+ logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
+ return
+ }
+
// next setup worker pool for processing PDF pages
pages := make(chan *pdfPage, pc.PageCount)
results := make(chan *pdfTextResult, pc.PageCount)
From b578506ea5e0cc70093b78a077e612816217b632 Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 11:09:19 +0800
Subject: [PATCH 5/9] :art: PDF files longer than 1024 pages are not included
in asset file content searching
https://github.com/siyuan-note/siyuan/issues/9053
---
kernel/model/asset_content.go | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index ff5c150ac..426d94d53 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -748,7 +748,7 @@ func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium,
// Parse will parse a PDF document using PDFium webassembly module using a worker pool
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
- st := time.Now()
+ now := time.Now()
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
return
}
@@ -850,7 +850,10 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
}
close(results)
- logging.LogInfof("convert [%s] PDF with %d pages using %d workers took %s.\n", tmp, pc.PageCount, cores, time.Since(st))
+
+ if 256 < pc.PageCount {
+ logging.LogInfof("convert [%s] PDF with [%d[ pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
+ }
// loop through ordered PDF text pages and join content for asset parse DB result
content := ""
From 9cfcec231004defc8bd1e3608c5b206fc55a2674 Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 11:09:37 +0800
Subject: [PATCH 6/9] :art: PDF files longer than 1024 pages are not included
in asset file content searching
https://github.com/siyuan-note/siyuan/issues/9053
---
kernel/model/asset_content.go | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 426d94d53..374eec0c5 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -705,7 +705,7 @@ func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium,
doc, err := instance.OpenDocument(&requests.OpenDocument{
File: pd.data,
})
- if err != nil {
+ if nil != err {
instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
Document: doc.Document,
})
@@ -725,7 +725,7 @@ func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium,
},
}
res, err := instance.GetPageText(req)
- if err != nil {
+ if nil != err {
instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
Document: doc.Document,
})
@@ -778,7 +778,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
MaxIdle: cores,
MaxTotal: cores,
})
- if err != nil {
+ if nil != err {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
@@ -786,20 +786,20 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
// first get the number of PDF pages to convert into text
instance, err := pool.GetInstance(time.Second * 30)
- if err != nil {
+ if nil != err {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
doc, err := instance.OpenDocument(&requests.OpenDocument{
File: &pdfData,
})
- if err != nil {
+ if nil != err {
instance.Close()
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
- if err != nil {
+ if nil != err {
instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
Document: doc.Document,
})
@@ -820,7 +820,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
results := make(chan *pdfTextResult, pc.PageCount)
for i := 0; i < cores; i++ {
inst, err := pool.GetInstance(time.Second * 30)
- if err != nil {
+ if nil != err {
close(pages)
close(results)
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
@@ -845,7 +845,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
for p := 0; p < pc.PageCount; p++ {
res := <-results
pagetext[res.pageNo] = res.text
- if res.err != nil {
+ if res.nil != err {
logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err)
}
}
From 184b4aa0746ef4ed8d411df5b2ddc08748225aa5 Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 11:12:11 +0800
Subject: [PATCH 7/9] :art: PDF files longer than 1024 pages are not included
in asset file content searching
https://github.com/siyuan-note/siyuan/issues/9053
---
kernel/model/asset_content.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 374eec0c5..34b80ca71 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -845,7 +845,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
for p := 0; p < pc.PageCount; p++ {
res := <-results
pagetext[res.pageNo] = res.text
- if res.nil != err {
+ if nil != res.err {
logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err)
}
}
From 7d6c101813a7c0de2c59214cbf7b78a13461f224 Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 11:13:39 +0800
Subject: [PATCH 8/9] :art: PDF files longer than 1024 pages are not included
in asset file content searching
https://github.com/siyuan-note/siyuan/issues/9053
---
kernel/model/asset_content.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 34b80ca71..49ec49bdc 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -851,7 +851,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
close(results)
- if 256 < pc.PageCount {
+ if 128 < pc.PageCount {
logging.LogInfof("convert [%s] PDF with [%d[ pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
}
From d3fa2bc5470e299f0e3bf716615b0409ce0e578f Mon Sep 17 00:00:00 2001
From: Daniel <845765@qq.com>
Date: Sun, 27 Aug 2023 11:17:53 +0800
Subject: [PATCH 9/9] :art: PDF files longer than 1024 pages are not included
in asset file content searching
https://github.com/siyuan-note/siyuan/issues/9053
---
kernel/model/asset_content.go | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go
index 49ec49bdc..cdb8fd09a 100644
--- a/kernel/model/asset_content.go
+++ b/kernel/model/asset_content.go
@@ -477,6 +477,7 @@ func NewAssetsSearcher() *AssetsSearcher {
const (
TxtAssetContentMaxSize = 1024 * 1024 * 4
+ PDFAssetContentMaxPage = 1024
)
type AssetParseResult struct {
@@ -501,7 +502,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
if TxtAssetContentMaxSize < info.Size() {
- logging.LogWarnf("file [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
+ logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
return
}
@@ -519,7 +520,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
if !utf8.Valid(data) {
// Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
- logging.LogWarnf("asset [%s] is not UTF-8 encoded", absPath)
+ logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
return
}
@@ -809,7 +810,7 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
instance.Close()
- if 1024 < pc.PageCount {
+ if PDFAssetContentMaxPage < pc.PageCount {
// PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
return