diff --git a/kernel/go.mod b/kernel/go.mod index 2f0a4e97d..fbc17c621 100644 --- a/kernel/go.mod +++ b/kernel/go.mod @@ -64,6 +64,11 @@ require ( golang.org/x/text v0.12.0 ) +require ( + github.com/jolestar/go-commons-pool/v2 v2.1.2 // indirect + github.com/tetratelabs/wazero v1.3.1 // indirect +) + require ( dmitri.shuralyov.com/font/woff2 v0.0.0-20180220214647-957792cbbdab // indirect github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 // indirect @@ -111,6 +116,7 @@ require ( github.com/juju/errors v1.0.0 // indirect github.com/klauspost/compress v1.16.7 // indirect github.com/klauspost/cpuid/v2 v2.2.5 // indirect + github.com/klippa-app/go-pdfium v1.6.0 github.com/leodido/go-urn v1.2.4 // indirect github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a // indirect diff --git a/kernel/go.sum b/kernel/go.sum index 9fd28e9dd..3771bac46 100644 --- a/kernel/go.sum +++ b/kernel/go.sum @@ -173,6 +173,7 @@ github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA= github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI= github.com/flopp/go-findfont v0.1.0 h1:lPn0BymDUtJo+ZkV01VS3661HL6F4qFlkhcJN55u6mU= github.com/flopp/go-findfont v0.1.0/go.mod h1:wKKxRDjD024Rh7VMwoU90i6ikQRCr+JTHB5n4Ejkqvw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= @@ -354,6 +355,8 @@ github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9Y github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/jolestar/go-commons-pool/v2 v2.1.2 h1:E+XGo58F23t7HtZiC/W6jzO2Ux2IccSH/yx4nD+J1CM= +github.com/jolestar/go-commons-pool/v2 v2.1.2/go.mod h1:r4NYccrkS5UqP1YQI1COyTZ9UjPJAAGTUxzcsK1kqhY= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= @@ -373,6 +376,8 @@ github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgo github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/klippa-app/go-pdfium v1.6.0 h1:swz+bKYsrRSuPrczot2cE/FoR/1h13R8CjBOv2RcDm4= +github.com/klippa-app/go-pdfium v1.6.0/go.mod h1:Lh8U8bQ+Idxz3e89+0u59j64YTPaO3G5JbvRImVqIio= github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= @@ -538,6 +543,8 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/studio-b12/gowebdav v0.9.0 h1:1j1sc9gQnNxbXXM4M/CebPOX4aXYtr7MojAVcN4dHjU= github.com/studio-b12/gowebdav v0.9.0/go.mod h1:bHA7t77X/QFExdeAnDzK6vKM34kEZAcE1OX4MfiwjkE= +github.com/tetratelabs/wazero v1.3.1 h1:rnb9FgOEQRLLR8tgoD1mfjNjMhFeWRUk+a4b4j/GpUM= +github.com/tetratelabs/wazero v1.3.1/go.mod h1:wYx2gNRg8/WihJfSDxA1TIL8H+GkfLYm+bIfbblu9VQ= github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI= github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4= diff --git a/kernel/model/asset_content.go b/kernel/model/asset_content.go index 622139168..34230845e 100644 --- a/kernel/model/asset_content.go +++ b/kernel/model/asset_content.go @@ -24,11 +24,14 @@ import ( "strconv" "strings" "sync" + "time" "code.sajari.com/docconv" "github.com/88250/gulu" "github.com/88250/lute/ast" "github.com/dustin/go-humanize" + "github.com/klippa-app/go-pdfium/requests" + "github.com/klippa-app/go-pdfium/webassembly" "github.com/siyuan-note/eventbus" "github.com/siyuan-note/filelock" "github.com/siyuan-note/logging" @@ -416,6 +419,7 @@ func NewAssetsSearcher() *AssetsSearcher { ".docx": &DocxAssetParser{}, ".pptx": &PptxAssetParser{}, ".xlsx": &XlsxAssetParser{}, + ".pdf": &PdfAssetParser{}, }, lock: &sync.Mutex{}, @@ -604,3 +608,98 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) { } return } + +// PdfAssetParser parser factory product +type PdfAssetParser struct { +} + +// Parse will parse a PDF document using PDFium webassembly module +func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) { + if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") { + return + } + + if !gulu.File.IsExist(absPath) { + return + } + + tmp := copyTempAsset(absPath) + if "" == tmp { + return + } + defer os.RemoveAll(tmp) + + f, err := os.Open(tmp) + if nil != err { + logging.LogErrorf("open [%s] failed: [%s]", tmp, err) + return + } + defer f.Close() + + stat, err := f.Stat() + if nil != err { + logging.LogErrorf("open [%s] failed: [%s]", tmp, err) + return + } + + // initialize pdfium with one worker + pool, err := webassembly.Init(webassembly.Config{ + MinIdle: 1, + MaxIdle: 1, + MaxTotal: 1, + }) + if err != nil { + logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) + return + } + defer pool.Close() + + instance, err := pool.GetInstance(time.Second * 30) + if err != nil { + logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) + return + } + defer instance.Close() + + // get number of pages inside PDF document + doc, err := instance.OpenDocument(&requests.OpenDocument{ + FileReader: f, + FileReaderSize: stat.Size(), + }) + if err != nil { + logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) + return + } + defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{ + Document: doc.Document, + }) + + pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document}) + if err != nil { + logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) + return + } + // loop through pages and get content + content := "" + for page := 0; page < pageCount.PageCount; page++ { + req := &requests.GetPageText{ + Page: requests.Page{ + ByIndex: &requests.PageByIndex{ + Document: doc.Document, + Index: page, + }, + }, + } + pt, err := instance.GetPageText(req) + if err != nil { + logging.LogErrorf("convert [%s] failed: [%s]", tmp, err) + return + } + content += " " + normalizeAssetContent(pt.Text) + } + + ret = &AssetParseResult{ + Content: content, + } + return +} diff --git a/kernel/model/asset_content_test.go b/kernel/model/asset_content_test.go new file mode 100644 index 000000000..7af59ed18 --- /dev/null +++ b/kernel/model/asset_content_test.go @@ -0,0 +1,13 @@ +package model + +import ( + "testing" +) + +func TestPDFParser(t *testing.T) { + p := &PdfAssetParser{} + res := p.Parse("../../testdata/parsertest.pdf") + if res == nil || res.Content == "" { + t.Fatalf("empty or nil PDF content result") + } +} diff --git a/testdata/parsertest.pdf b/testdata/parsertest.pdf new file mode 100644 index 000000000..1d731130f Binary files /dev/null and b/testdata/parsertest.pdf differ