mirror of
https://github.com/siyuan-note/siyuan.git
synced 2025-12-16 22:50:13 +01:00
Support searching PDF asset content (#8985)
* feat(asset): add PDF parser * test: changed test function name to match parser * asset: add separator * asset: add pdf factory product * test: remove println --------- Co-authored-by: Heiko Besemann <heiko.besemann@qbeyond.de>
This commit is contained in:
parent
ec31ac0741
commit
19a295e157
5 changed files with 125 additions and 0 deletions
|
|
@ -24,11 +24,14 @@ import (
|
|||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"code.sajari.com/docconv"
|
||||
"github.com/88250/gulu"
|
||||
"github.com/88250/lute/ast"
|
||||
"github.com/dustin/go-humanize"
|
||||
"github.com/klippa-app/go-pdfium/requests"
|
||||
"github.com/klippa-app/go-pdfium/webassembly"
|
||||
"github.com/siyuan-note/eventbus"
|
||||
"github.com/siyuan-note/filelock"
|
||||
"github.com/siyuan-note/logging"
|
||||
|
|
@ -416,6 +419,7 @@ func NewAssetsSearcher() *AssetsSearcher {
|
|||
".docx": &DocxAssetParser{},
|
||||
".pptx": &PptxAssetParser{},
|
||||
".xlsx": &XlsxAssetParser{},
|
||||
".pdf": &PdfAssetParser{},
|
||||
},
|
||||
|
||||
lock: &sync.Mutex{},
|
||||
|
|
@ -604,3 +608,98 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|||
}
|
||||
return
|
||||
}
|
||||
|
||||
// PdfAssetParser parser factory product
|
||||
type PdfAssetParser struct {
|
||||
}
|
||||
|
||||
// Parse will parse a PDF document using PDFium webassembly module
|
||||
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
||||
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
|
||||
return
|
||||
}
|
||||
|
||||
if !gulu.File.IsExist(absPath) {
|
||||
return
|
||||
}
|
||||
|
||||
tmp := copyTempAsset(absPath)
|
||||
if "" == tmp {
|
||||
return
|
||||
}
|
||||
defer os.RemoveAll(tmp)
|
||||
|
||||
f, err := os.Open(tmp)
|
||||
if nil != err {
|
||||
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
stat, err := f.Stat()
|
||||
if nil != err {
|
||||
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
|
||||
// initialize pdfium with one worker
|
||||
pool, err := webassembly.Init(webassembly.Config{
|
||||
MinIdle: 1,
|
||||
MaxIdle: 1,
|
||||
MaxTotal: 1,
|
||||
})
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer pool.Close()
|
||||
|
||||
instance, err := pool.GetInstance(time.Second * 30)
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer instance.Close()
|
||||
|
||||
// get number of pages inside PDF document
|
||||
doc, err := instance.OpenDocument(&requests.OpenDocument{
|
||||
FileReader: f,
|
||||
FileReaderSize: stat.Size(),
|
||||
})
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
|
||||
Document: doc.Document,
|
||||
})
|
||||
|
||||
pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
// loop through pages and get content
|
||||
content := ""
|
||||
for page := 0; page < pageCount.PageCount; page++ {
|
||||
req := &requests.GetPageText{
|
||||
Page: requests.Page{
|
||||
ByIndex: &requests.PageByIndex{
|
||||
Document: doc.Document,
|
||||
Index: page,
|
||||
},
|
||||
},
|
||||
}
|
||||
pt, err := instance.GetPageText(req)
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
content += " " + normalizeAssetContent(pt.Text)
|
||||
}
|
||||
|
||||
ret = &AssetParseResult{
|
||||
Content: content,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue