Support searching PDF asset content (#8985)

* feat(asset): add PDF parser

* test: changed test function name to match parser

* asset: add separator

* asset: add pdf factory product

* test: remove println

---------

Co-authored-by: Heiko Besemann <heiko.besemann@qbeyond.de>
This commit is contained in:
nekrondev 2023-08-17 05:52:59 +02:00 committed by GitHub
parent ec31ac0741
commit 19a295e157
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 125 additions and 0 deletions

View file

@ -24,11 +24,14 @@ import (
"strconv"
"strings"
"sync"
"time"
"code.sajari.com/docconv"
"github.com/88250/gulu"
"github.com/88250/lute/ast"
"github.com/dustin/go-humanize"
"github.com/klippa-app/go-pdfium/requests"
"github.com/klippa-app/go-pdfium/webassembly"
"github.com/siyuan-note/eventbus"
"github.com/siyuan-note/filelock"
"github.com/siyuan-note/logging"
@ -416,6 +419,7 @@ func NewAssetsSearcher() *AssetsSearcher {
".docx": &DocxAssetParser{},
".pptx": &PptxAssetParser{},
".xlsx": &XlsxAssetParser{},
".pdf": &PdfAssetParser{},
},
lock: &sync.Mutex{},
@ -604,3 +608,98 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
return
}
// PdfAssetParser is the asset parser for PDF documents. It carries no state;
// all work happens in its Parse method.
type PdfAssetParser struct{}
// Parse extracts the text of every page of the PDF document at absPath using
// the PDFium WebAssembly module.
//
// The document is not read in place: Parse works on the path returned by
// copyTempAsset and removes that path again before returning.
//
// It returns nil when absPath does not end in ".pdf" (case-insensitive), when
// the file does not exist, when copyTempAsset yields an empty path, or when
// any file or PDFium operation fails (those failures are logged). A failure
// on any single page discards the text collected so far. On success the
// result's Content holds the normalized text of all pages in page order, each
// page preceded by a single space.
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
	if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
		return
	}

	if !gulu.File.IsExist(absPath) {
		return
	}

	tmp := copyTempAsset(absPath)
	if "" == tmp {
		return
	}
	defer os.RemoveAll(tmp)

	f, err := os.Open(tmp)
	if nil != err {
		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
		return
	}
	defer f.Close()

	// The file size is required by PDFium when reading through a FileReader.
	stat, err := f.Stat()
	if nil != err {
		logging.LogErrorf("stat [%s] failed: [%s]", tmp, err)
		return
	}

	// Initialize PDFium with exactly one worker.
	// NOTE(review): a fresh pool is created and torn down on every call;
	// this is presumably costly when many PDFs are indexed — consider sharing.
	pool, err := webassembly.Init(webassembly.Config{
		MinIdle:  1,
		MaxIdle:  1,
		MaxTotal: 1,
	})
	if err != nil {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	defer pool.Close()

	// Wait at most 30 seconds for the worker instance.
	instance, err := pool.GetInstance(time.Second * 30)
	if err != nil {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	defer instance.Close()

	doc, err := instance.OpenDocument(&requests.OpenDocument{
		FileReader:     f,
		FileReaderSize: stat.Size(),
	})
	if err != nil {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	// Deferred calls run in reverse order, so the document is closed before
	// the instance, the pool, the file, and finally the temp path removal.
	defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
		Document: doc.Document,
	})

	// Get the number of pages inside the PDF document.
	pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
	if err != nil {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}

	// Loop through the pages and collect their text. A builder avoids the
	// quadratic cost of repeated string concatenation on large documents.
	var content strings.Builder
	for page := 0; page < pageCount.PageCount; page++ {
		req := &requests.GetPageText{
			Page: requests.Page{
				ByIndex: &requests.PageByIndex{
					Document: doc.Document,
					Index:    page,
				},
			},
		}
		pt, err := instance.GetPageText(req)
		if err != nil {
			logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
			return
		}
		content.WriteByte(' ')
		content.WriteString(normalizeAssetContent(pt.Text))
	}

	ret = &AssetParseResult{
		Content: content.String(),
	}
	return
}