Mirror of https://github.com/siyuan-note/siyuan.git, synced 2026-01-04 15:58:49 +01:00

Merge remote-tracking branch 'origin/dev' into dev

commit 9fe5b6cb94

3 changed files with 226 additions and 42 deletions
kernel/api/archive.go (new file, 81 additions)

@@ -0,0 +1,81 @@
+// SiYuan - Refactor your thinking
+// Copyright (c) 2020-present, b3log.org
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+package api
+
+import (
+	"net/http"
+	"path/filepath"
+
+	"github.com/88250/gulu"
+	"github.com/gin-gonic/gin"
+	"github.com/siyuan-note/siyuan/kernel/util"
+)
+
+func zip(c *gin.Context) {
+	ret := gulu.Ret.NewResult()
+	defer c.JSON(http.StatusOK, ret)
+
+	arg, ok := util.JsonArg(c, ret)
+	if !ok {
+		return
+	}
+
+	path := arg["path"].(string)
+	zipPath := arg["zipPath"].(string)
+	zipFile, err := gulu.Zip.Create(zipPath)
+	if nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+
+	base := filepath.Base(path)
+	if gulu.File.IsDir(path) {
+		err = zipFile.AddDirectory(base, path)
+	} else {
+		err = zipFile.AddEntry(base, path)
+	}
+	if nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+
+	if err = zipFile.Close(); nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+}
+
+func unzip(c *gin.Context) {
+	ret := gulu.Ret.NewResult()
+	defer c.JSON(http.StatusOK, ret)
+
+	arg, ok := util.JsonArg(c, ret)
+	if !ok {
+		return
+	}
+
+	zipPath := arg["zipPath"].(string)
+	path := arg["path"].(string)
+	if err := gulu.Zip.Unzip(zipPath, path); nil != err {
+		ret.Code = -1
+		ret.Msg = err.Error()
+		return
+	}
+}
@@ -377,4 +377,7 @@ func ServeAPI(ginServer *gin.Engine) {
 	ginServer.Handle("GET", "/api/broadcast/channels", model.CheckAuth, getChannels)
 	ginServer.Handle("POST", "/api/broadcast/postMessage", model.CheckAuth, postMessage)
 	ginServer.Handle("POST", "/api/broadcast/getChannelInfo", model.CheckAuth, getChannelInfo)
+
+	ginServer.Handle("POST", "/api/archive/zip", model.CheckAuth, zip)
+	ginServer.Handle("POST", "/api/archive/unzip", model.CheckAuth, unzip)
 }
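The new endpoints take a JSON body with "path" and "zipPath" fields and reply with the usual {code, msg, data} envelope. A minimal client sketch for exercising them, not part of this commit: the base URL (the kernel's default 127.0.0.1:6806), the "Authorization: Token ..." header, and the file paths are all assumptions about a typical local setup.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// callArchive posts a JSON body to one of the new /api/archive/* endpoints.
// The endpoint paths and request fields (path, zipPath) come from this commit;
// the base URL and token header are assumed, illustrative defaults.
func callArchive(endpoint, path, zipPath, token string) error {
	body, _ := json.Marshal(map[string]string{
		"path":    path,
		"zipPath": zipPath,
	})
	req, err := http.NewRequest("POST", "http://127.0.0.1:6806"+endpoint, bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Token "+token) // assumed auth scheme
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	var ret struct {
		Code int    `json:"code"`
		Msg  string `json:"msg"`
	}
	if err = json.NewDecoder(resp.Body).Decode(&ret); err != nil {
		return err
	}
	if ret.Code != 0 {
		return fmt.Errorf("kernel returned code %d: %s", ret.Code, ret.Msg)
	}
	return nil
}

func main() {
	// Zip a directory, then unzip the archive elsewhere (paths are placeholders).
	if err := callArchive("/api/archive/zip", "/tmp/notes", "/tmp/notes.zip", "token123"); err != nil {
		fmt.Println("zip:", err)
	}
	if err := callArchive("/api/archive/unzip", "/tmp/out", "/tmp/notes.zip", "token123"); err != nil {
		fmt.Println("unzip:", err)
	}
}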
@@ -21,15 +21,18 @@ import (
 	"io/fs"
 	"os"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"
 
 	"code.sajari.com/docconv"
 	"github.com/88250/gulu"
 	"github.com/88250/lute/ast"
 	"github.com/dustin/go-humanize"
+	"github.com/klippa-app/go-pdfium"
 	"github.com/klippa-app/go-pdfium/requests"
 	"github.com/klippa-app/go-pdfium/webassembly"
 	"github.com/siyuan-note/eventbus"
@@ -474,6 +477,7 @@ func NewAssetsSearcher() *AssetsSearcher {
 
 const (
 	TxtAssetContentMaxSize = 1024 * 1024 * 4
+	PDFAssetContentMaxPage = 1024
 )
 
 type AssetParseResult struct {
@@ -498,7 +502,7 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 	}
 
 	if TxtAssetContentMaxSize < info.Size() {
-		logging.LogWarnf("file [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
+		logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
 		return
 	}
 
@@ -514,6 +518,12 @@ func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 		return
 	}
 
+	if !utf8.Valid(data) {
+		// Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
+		logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
+		return
+	}
+
 	content := string(data)
 	ret = &AssetParseResult{
 		Content: content,
@@ -676,8 +686,70 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 type PdfAssetParser struct {
 }
 
-// Parse will parse a PDF document using PDFium webassembly module
+// pdfPage struct defines a worker job for text extraction
+type pdfPage struct {
+	pageNo int     // page number for text extraction
+	data   *[]byte // pointer to PDF document data
+}
+
+// pdfTextResult struct defines the extracted PDF text result
+type pdfTextResult struct {
+	pageNo int    // page number of PDF document
+	text   string // text of converted page
+	err    error  // processing error
+}
+
+// getTextPageWorker will extract the text from a given PDF page and return its result
+func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
+	defer instance.Close()
+	for pd := range page {
+		doc, err := instance.OpenDocument(&requests.OpenDocument{
+			File: pd.data,
+		})
+		if nil != err {
+			instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+				Document: doc.Document,
+			})
+			result <- &pdfTextResult{
+				pageNo: pd.pageNo,
+				err:    err,
+			}
+			continue
+		}
+
+		req := &requests.GetPageText{
+			Page: requests.Page{
+				ByIndex: &requests.PageByIndex{
+					Document: doc.Document,
+					Index:    pd.pageNo,
+				},
+			},
+		}
+		res, err := instance.GetPageText(req)
+		if nil != err {
+			instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+				Document: doc.Document,
+			})
+			result <- &pdfTextResult{
+				pageNo: pd.pageNo,
+				err:    err,
+			}
+			continue
+		}
+		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+			Document: doc.Document,
+		})
+		result <- &pdfTextResult{
+			pageNo: pd.pageNo,
+			text:   res.Text,
+			err:    nil,
+		}
+	}
+}
+
+// Parse will parse a PDF document using the PDFium webassembly module and a worker pool
 func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
+	now := time.Now()
 	if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
 		return
 	}
@@ -692,75 +764,103 @@ func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
 	}
 	defer os.RemoveAll(tmp)
 
-	f, err := os.Open(tmp)
-	if nil != err {
-		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
-		return
-	}
-	defer f.Close()
-
-	stat, err := f.Stat()
+	// the PDF blob is processed in-memory, making it possible to share the document data across worker goroutines
+	pdfData, err := os.ReadFile(tmp)
 	if nil != err {
 		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
 		return
 	}
 
-	// initialize pdfium with one worker
+	// initialize go-pdfium with the number of available cores
+	// we fire up the complete worker pool for maximum performance
+	cores := runtime.NumCPU()
 	pool, err := webassembly.Init(webassembly.Config{
-		MinIdle:  1,
-		MaxIdle:  1,
-		MaxTotal: 1,
+		MinIdle:  cores,
+		MaxIdle:  cores,
+		MaxTotal: cores,
 	})
-	if err != nil {
+	if nil != err {
 		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 		return
 	}
+	defer pool.Close()
 
+	// first get the number of PDF pages to convert into text
 	instance, err := pool.GetInstance(time.Second * 30)
-	if err != nil {
+	if nil != err {
 		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 		return
 	}
-	defer instance.Close()
 
+	// get number of pages inside PDF document
 	doc, err := instance.OpenDocument(&requests.OpenDocument{
-		FileReader:     f,
-		FileReaderSize: stat.Size(),
+		File: &pdfData,
 	})
-	if err != nil {
+	if nil != err {
+		instance.Close()
 		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
 		return
 	}
-	defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
-		Document: doc.Document,
-	})
-
-	pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
-	if err != nil {
-		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
-		return
-	}
-
-	content := ""
-	for page := 0; page < pageCount.PageCount; page++ {
-		req := &requests.GetPageText{
-			Page: requests.Page{
-				ByIndex: &requests.PageByIndex{
-					Document: doc.Document,
-					Index:    page,
-				},
-			},
-		}
-		pt, err := instance.GetPageText(req)
-		if err != nil {
-			logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
-			return
-		}
-		content += " " + normalizeNonTxtAssetContent(pt.Text)
-	}
+	pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
+	if nil != err {
+		instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+			Document: doc.Document,
+		})
+		instance.Close()
+		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
+		return
+	}
+	instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
+		Document: doc.Document,
+	})
+	instance.Close()
+
+	if PDFAssetContentMaxPage < pc.PageCount {
+		// PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
+		logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
+		return
+	}
+
+	// next, set up the worker pool for processing PDF pages
+	pages := make(chan *pdfPage, pc.PageCount)
+	results := make(chan *pdfTextResult, pc.PageCount)
+	for i := 0; i < cores; i++ {
+		inst, err := pool.GetInstance(time.Second * 30)
+		if nil != err {
+			close(pages)
+			close(results)
+			logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
+			return
+		}
+		go parser.getTextPageWorker(i, inst, pages, results)
+	}
+
+	// now hand the pages to the worker pool
+	for p := 0; p < pc.PageCount; p++ {
+		pages <- &pdfPage{
+			pageNo: p,
+			data:   &pdfData,
+		}
+	}
+	close(pages)
+
+	// finally fetch the PDF page text results
+	// Note: some workers will process pages faster than other workers depending on the page contents
+	// the order of returned PDF text pages is random and must be sorted using the pageNo index
+	pagetext := make([]string, pc.PageCount)
+	for p := 0; p < pc.PageCount; p++ {
+		res := <-results
+		pagetext[res.pageNo] = res.text
+		if nil != res.err {
+			logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, res.err)
+		}
+	}
+	close(results)
+
+	if 128 < pc.PageCount {
+		logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
+	}
+
+	// loop through ordered PDF text pages and join content for asset parse DB result
+	content := ""
+	for _, pt := range pagetext {
+		content += " " + normalizeNonTxtAssetContent(pt)
+	}
 	ret = &AssetParseResult{
 		Content: content,
 	}
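The rewritten Parse follows a standard fan-out/fan-in shape: page jobs are queued on a buffered channel, one worker per core drains it, and results arrive in arbitrary order and are re-slotted by pageNo. A stripped-down sketch of the same pattern, independent of pdfium (all identifiers here are illustrative, not from the commit):

package main

import (
	"fmt"
	"runtime"
	"strings"
)

type job struct{ no int }

type result struct {
	no   int
	text string
}

// worker drains the jobs channel and reports one result per job.
func worker(jobs <-chan job, results chan<- result) {
	for j := range jobs {
		// stand-in for the per-page text extraction call
		results <- result{no: j.no, text: fmt.Sprintf("page %d", j.no)}
	}
}

func main() {
	const pageCount = 8
	jobs := make(chan job, pageCount)      // buffered so producers never block
	results := make(chan result, pageCount)

	// fan out: one goroutine per core, as the commit does
	for i := 0; i < runtime.NumCPU(); i++ {
		go worker(jobs, results)
	}
	for p := 0; p < pageCount; p++ {
		jobs <- job{no: p}
	}
	close(jobs)

	// fan in: results come back unordered, so slot them by index
	pages := make([]string, pageCount)
	for p := 0; p < pageCount; p++ {
		r := <-results
		pages[r.no] = r.text
	}
	fmt.Println(strings.Join(pages, " | "))
}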