Merge remote-tracking branch 'origin/dev' into dev

This commit is contained in:
Vanessa 2023-08-27 12:10:57 +08:00
commit 9fe5b6cb94
3 changed files with 226 additions and 42 deletions

81
kernel/api/archive.go Normal file
View file

@ -0,0 +1,81 @@
// SiYuan - Refactor your thinking
// Copyright (c) 2020-present, b3log.org
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package api
import (
"net/http"
"path/filepath"
"github.com/88250/gulu"
"github.com/gin-gonic/gin"
"github.com/siyuan-note/siyuan/kernel/util"
)
// zip archives the file or directory at arg "path" into a new zip file at
// arg "zipPath". On any failure ret.Code is set to -1 and ret.Msg carries
// the error; the JSON result is always written via the deferred c.JSON.
func zip(c *gin.Context) {
	ret := gulu.Ret.NewResult()
	defer c.JSON(http.StatusOK, ret)

	arg, ok := util.JsonArg(c, ret)
	if !ok {
		return
	}

	// The request body is untrusted input: use checked type assertions so a
	// missing or non-string field reports an error instead of panicking.
	path, ok := arg["path"].(string)
	if !ok {
		ret.Code = -1
		ret.Msg = "parameter [path] is required and must be a string"
		return
	}
	zipPath, ok := arg["zipPath"].(string)
	if !ok {
		ret.Code = -1
		ret.Msg = "parameter [zipPath] is required and must be a string"
		return
	}

	zipFile, err := gulu.Zip.Create(zipPath)
	if nil != err {
		ret.Code = -1
		ret.Msg = err.Error()
		return
	}

	base := filepath.Base(path)
	if gulu.File.IsDir(path) {
		err = zipFile.AddDirectory(base, path)
	} else {
		err = zipFile.AddEntry(base, path)
	}
	if nil != err {
		// Best-effort close so the partially written archive handle is not
		// leaked; the add error is the one worth reporting.
		zipFile.Close()
		ret.Code = -1
		ret.Msg = err.Error()
		return
	}

	// Close finalizes the archive, so its error must be checked explicitly.
	if err = zipFile.Close(); nil != err {
		ret.Code = -1
		ret.Msg = err.Error()
		return
	}
}
// unzip extracts the zip file at arg "zipPath" into the directory at arg
// "path". On any failure ret.Code is set to -1 and ret.Msg carries the
// error; the JSON result is always written via the deferred c.JSON.
func unzip(c *gin.Context) {
	ret := gulu.Ret.NewResult()
	defer c.JSON(http.StatusOK, ret)

	arg, ok := util.JsonArg(c, ret)
	if !ok {
		return
	}

	// The request body is untrusted input: use checked type assertions so a
	// missing or non-string field reports an error instead of panicking.
	zipPath, ok := arg["zipPath"].(string)
	if !ok {
		ret.Code = -1
		ret.Msg = "parameter [zipPath] is required and must be a string"
		return
	}
	path, ok := arg["path"].(string)
	if !ok {
		ret.Code = -1
		ret.Msg = "parameter [path] is required and must be a string"
		return
	}

	if err := gulu.Zip.Unzip(zipPath, path); nil != err {
		ret.Code = -1
		ret.Msg = err.Error()
		return
	}
}

View file

@ -377,4 +377,7 @@ func ServeAPI(ginServer *gin.Engine) {
ginServer.Handle("GET", "/api/broadcast/channels", model.CheckAuth, getChannels)
ginServer.Handle("POST", "/api/broadcast/postMessage", model.CheckAuth, postMessage)
ginServer.Handle("POST", "/api/broadcast/getChannelInfo", model.CheckAuth, getChannelInfo)
ginServer.Handle("POST", "/api/archive/zip", model.CheckAuth, zip)
ginServer.Handle("POST", "/api/archive/unzip", model.CheckAuth, unzip)
}

View file

@ -21,15 +21,18 @@ import (
"io/fs"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"code.sajari.com/docconv"
"github.com/88250/gulu"
"github.com/88250/lute/ast"
"github.com/dustin/go-humanize"
"github.com/klippa-app/go-pdfium"
"github.com/klippa-app/go-pdfium/requests"
"github.com/klippa-app/go-pdfium/webassembly"
"github.com/siyuan-note/eventbus"
@ -474,6 +477,7 @@ func NewAssetsSearcher() *AssetsSearcher {
// Limits applied when indexing asset file content for search.
const (
	// TxtAssetContentMaxSize is the largest text asset (4 MiB) whose content
	// is indexed; larger files are skipped with a warning.
	TxtAssetContentMaxSize = 1024 * 1024 * 4
	// PDFAssetContentMaxPage is the largest page count of a PDF asset whose
	// content is indexed; longer documents are skipped.
	PDFAssetContentMaxPage = 1024
)
type AssetParseResult struct {
@ -498,7 +502,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
if TxtAssetContentMaxSize < info.Size() {
logging.LogWarnf("file [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
return
}
@ -514,6 +518,12 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
return
}
if !utf8.Valid(data) {
// Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
return
}
content := string(data)
ret = &AssetParseResult{
Content: content,
@ -676,8 +686,70 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
// PdfAssetParser extracts the text content of PDF assets (via the PDFium
// webassembly module) so it can be indexed for asset content search.
type PdfAssetParser struct {
}
// Parse will parse a PDF document using PDFium webassembly module
// pdfPage defines a single worker job for PDF text extraction.
type pdfPage struct {
	pageNo int     // zero-based page number to extract text from
	data   *[]byte // pointer to the whole PDF document data, shared across workers
}
// pdfTextResult defines the result of extracting the text of one PDF page.
type pdfTextResult struct {
	pageNo int    // zero-based page number the text belongs to
	text   string // extracted text of the page (empty on error)
	err    error  // extraction error, nil on success
}
// getTextPageWorker extracts the text of each page received on the page
// channel using the given PDFium instance and sends exactly one result per
// job on the result channel. It is meant to run as a goroutine and exits
// (closing its instance) when the page channel is closed. id identifies the
// worker; it is currently unused beyond debugging.
func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
	defer instance.Close()
	for pd := range page {
		doc, err := instance.OpenDocument(&requests.OpenDocument{
			File: pd.data,
		})
		if nil != err {
			// Do not touch doc here: when OpenDocument fails there is no
			// usable document, and dereferencing it would panic.
			result <- &pdfTextResult{
				pageNo: pd.pageNo,
				err:    err,
			}
			continue
		}

		res, err := instance.GetPageText(&requests.GetPageText{
			Page: requests.Page{
				ByIndex: &requests.PageByIndex{
					Document: doc.Document,
					Index:    pd.pageNo,
				},
			},
		})

		// The document is closed on every path once it has been opened
		// successfully, regardless of whether text extraction succeeded.
		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
			Document: doc.Document,
		})

		if nil != err {
			result <- &pdfTextResult{
				pageNo: pd.pageNo,
				err:    err,
			}
			continue
		}

		result <- &pdfTextResult{
			pageNo: pd.pageNo,
			text:   res.Text,
		}
	}
}
// Parse will parse a PDF document using PDFium webassembly module using a worker pool
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
now := time.Now()
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
return
}
@ -692,75 +764,103 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
defer os.RemoveAll(tmp)
f, err := os.Open(tmp)
if nil != err {
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
return
}
defer f.Close()
stat, err := f.Stat()
// PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible
pdfData, err := os.ReadFile(tmp)
if nil != err {
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
return
}
// initialize pdfium with one worker
// initialize go-pdfium with number of available cores
// we fire up the complete worker pool for maximum performance
cores := runtime.NumCPU()
pool, err := webassembly.Init(webassembly.Config{
MinIdle: 1,
MaxIdle: 1,
MaxTotal: 1,
MinIdle: cores,
MaxIdle: cores,
MaxTotal: cores,
})
if err != nil {
if nil != err {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
defer pool.Close()
// first get the number of PDF pages to convert into text
instance, err := pool.GetInstance(time.Second * 30)
if err != nil {
if nil != err {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
defer instance.Close()
// get number of pages inside PDF document
doc, err := instance.OpenDocument(&requests.OpenDocument{
FileReader: f,
FileReaderSize: stat.Size(),
File: &pdfData,
})
if err != nil {
if nil != err {
instance.Close()
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
Document: doc.Document,
})
pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
if nil != err {
instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
Document: doc.Document,
})
instance.Close()
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
instance.Close()
pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
if err != nil {
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
if PDFAssetContentMaxPage < pc.PageCount {
// PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
return
}
// loop through pages and get content
content := ""
for page := 0; page < pageCount.PageCount; page++ {
req := &requests.GetPageText{
Page: requests.Page{
ByIndex: &requests.PageByIndex{
Document: doc.Document,
Index: page,
},
},
}
pt, err := instance.GetPageText(req)
if err != nil {
// next setup worker pool for processing PDF pages
pages := make(chan *pdfPage, pc.PageCount)
results := make(chan *pdfTextResult, pc.PageCount)
for i := 0; i < cores; i++ {
inst, err := pool.GetInstance(time.Second * 30)
if nil != err {
close(pages)
close(results)
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
return
}
content += " " + normalizeNonTxtAssetContent(pt.Text)
go parser.getTextPageWorker(i, inst, pages, results)
}
// now split pages and let them process by worker pool
for p := 0; p < pc.PageCount; p++ {
pages <- &pdfPage{
pageNo: p,
data: &pdfData,
}
}
close(pages)
// finally fetch the PDF page text results
// Note: some workers will process pages faster than other workers depending on the page contents
// the order of returned PDF text pages is random and must be sorted using the pageNo index
pagetext := make([]string, pc.PageCount)
for p := 0; p < pc.PageCount; p++ {
res := <-results
pagetext[res.pageNo] = res.text
if nil != res.err {
logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err)
}
}
close(results)
if 128 < pc.PageCount {
logging.LogInfof("convert [%s] PDF with [%d[ pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
}
// loop through ordered PDF text pages and join content for asset parse DB result
content := ""
for _, pt := range pagetext {
content += " " + normalizeNonTxtAssetContent(pt)
}
ret = &AssetParseResult{
Content: content,
}