2023-08-04 12:05:29 +08:00
|
|
|
|
// SiYuan - Refactor your thinking
|
|
|
|
|
|
// Copyright (c) 2020-present, b3log.org
|
|
|
|
|
|
//
|
|
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
|
// (at your option) any later version.
|
|
|
|
|
|
//
|
|
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
|
|
//
|
|
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
|
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
|
|
package model
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
2023-08-09 21:14:04 +08:00
|
|
|
|
"bytes"
|
2023-08-04 12:29:32 +08:00
|
|
|
|
"io/fs"
|
|
|
|
|
|
"os"
|
|
|
|
|
|
"path/filepath"
|
2023-08-11 00:01:31 +08:00
|
|
|
|
"strconv"
|
2023-08-04 12:29:32 +08:00
|
|
|
|
"strings"
|
|
|
|
|
|
"sync"
|
2023-08-17 05:52:59 +02:00
|
|
|
|
"time"
|
2023-08-04 12:29:32 +08:00
|
|
|
|
|
2023-08-09 21:14:04 +08:00
|
|
|
|
"code.sajari.com/docconv"
|
2023-08-04 12:05:29 +08:00
|
|
|
|
"github.com/88250/gulu"
|
|
|
|
|
|
"github.com/88250/lute/ast"
|
2023-08-15 09:34:58 +08:00
|
|
|
|
"github.com/dustin/go-humanize"
|
2023-08-17 05:52:59 +02:00
|
|
|
|
"github.com/klippa-app/go-pdfium/requests"
|
|
|
|
|
|
"github.com/klippa-app/go-pdfium/webassembly"
|
2023-08-04 12:05:29 +08:00
|
|
|
|
"github.com/siyuan-note/eventbus"
|
|
|
|
|
|
"github.com/siyuan-note/filelock"
|
|
|
|
|
|
"github.com/siyuan-note/logging"
|
2023-08-15 17:04:34 +08:00
|
|
|
|
"github.com/siyuan-note/siyuan/kernel/search"
|
2023-08-04 12:05:29 +08:00
|
|
|
|
"github.com/siyuan-note/siyuan/kernel/sql"
|
|
|
|
|
|
"github.com/siyuan-note/siyuan/kernel/task"
|
|
|
|
|
|
"github.com/siyuan-note/siyuan/kernel/util"
|
2023-08-18 16:20:32 +08:00
|
|
|
|
"github.com/wmentor/epub"
|
2023-08-09 21:14:04 +08:00
|
|
|
|
"github.com/xuri/excelize/v2"
|
2023-08-04 12:05:29 +08:00
|
|
|
|
)
|
|
|
|
|
|
|
2023-08-11 10:19:11 +08:00
|
|
|
|
// AssetContent is the JSON-facing view of one indexed asset file returned by
// the asset-content search APIs.
type AssetContent struct {
	ID   string `json:"id"`   // database row ID of the indexed content
	Name string `json:"name"` // file name with any embedded ID removed
	Ext  string `json:"ext"`  // file extension, e.g. ".pdf"
	Path string `json:"path"` // workspace-relative path beginning with "assets"
	Size int64  `json:"size"` // file size in bytes
	// HSize is Size rendered human-readable (e.g. "1.2 MB").
	HSize string `json:"hSize"`
	// Updated is the file modification time as a Unix timestamp (seconds).
	Updated int64 `json:"updated"`
	// Content is the extracted text, HTML-escaped, with search hits wrapped
	// in <mark> tags when a query matched.
	Content string `json:"content"`
}
|
|
|
|
|
|
|
2023-08-12 22:48:46 +08:00
|
|
|
|
func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) {
|
|
|
|
|
|
if "" != query && (0 == queryMethod || 1 == queryMethod) {
|
|
|
|
|
|
if 0 == queryMethod {
|
|
|
|
|
|
query = stringQuery(query)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
table := "asset_contents_fts_case_insensitive"
|
|
|
|
|
|
filter := " id = '" + id + "'"
|
|
|
|
|
|
if "" != query {
|
|
|
|
|
|
filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
projections := "id, name, ext, path, size, updated, " +
|
2023-08-15 17:04:34 +08:00
|
|
|
|
"highlight(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "') AS content"
|
2023-08-12 22:48:46 +08:00
|
|
|
|
stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter
|
|
|
|
|
|
assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1)
|
|
|
|
|
|
results := fromSQLAssetContents(&assetContents, 36)
|
|
|
|
|
|
if 1 > len(results) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
ret = results[0]
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 00:01:31 +08:00
|
|
|
|
// FullTextSearchAssetContent 搜索资源文件内容。
|
|
|
|
|
|
//
|
|
|
|
|
|
// method:0:关键字,1:查询语法,2:SQL,3:正则表达式
|
2023-08-15 16:32:13 +08:00
|
|
|
|
// orderBy: 0:按相关度降序,1:按相关度升序,2:按更新时间升序,3:按更新时间降序
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
query = strings.TrimSpace(query)
|
|
|
|
|
|
beforeLen := 36
|
|
|
|
|
|
orderByClause := buildAssetContentOrderBy(orderBy)
|
|
|
|
|
|
switch method {
|
|
|
|
|
|
case 1: // 查询语法
|
|
|
|
|
|
filter := buildAssetContentTypeFilter(types)
|
2023-08-11 10:28:32 +08:00
|
|
|
|
ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
case 2: // SQL
|
2023-08-11 10:28:32 +08:00
|
|
|
|
ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
case 3: // 正则表达式
|
|
|
|
|
|
typeFilter := buildAssetContentTypeFilter(types)
|
2023-08-11 10:28:32 +08:00
|
|
|
|
ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
default: // 关键字
|
|
|
|
|
|
filter := buildAssetContentTypeFilter(types)
|
2023-08-11 10:28:32 +08:00
|
|
|
|
ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
}
|
2023-08-11 10:28:32 +08:00
|
|
|
|
pageCount = (matchedAssetCount + pageSize - 1) / pageSize
|
2023-08-11 00:01:31 +08:00
|
|
|
|
|
|
|
|
|
|
if 1 > len(ret) {
|
2023-08-11 10:19:11 +08:00
|
|
|
|
ret = []*AssetContent{}
|
2023-08-11 00:01:31 +08:00
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
query = gulu.Str.RemoveInvisible(query)
|
|
|
|
|
|
return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
query = gulu.Str.RemoveInvisible(query)
|
|
|
|
|
|
query = stringQuery(query)
|
|
|
|
|
|
return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
exp = gulu.Str.RemoveInvisible(exp)
|
|
|
|
|
|
fieldFilter := assetContentFieldRegexp(exp)
|
|
|
|
|
|
stmt := "SELECT * FROM `asset_contents_fts_case_insensitive` WHERE " + fieldFilter + " AND ext IN " + typeFilter
|
|
|
|
|
|
stmt += " " + orderBy
|
|
|
|
|
|
stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
|
2023-08-11 10:19:11 +08:00
|
|
|
|
assetContents := sql.SelectAssetContentsRawStmtNoParse(stmt, Conf.Search.Limit)
|
|
|
|
|
|
ret = fromSQLAssetContents(&assetContents, beforeLen)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
if 1 > len(ret) {
|
2023-08-11 10:19:11 +08:00
|
|
|
|
ret = []*AssetContent{}
|
2023-08-11 00:01:31 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// assetContentFieldRegexp builds the WHERE fragment that matches exp (a
// SQLite REGEXP pattern) against either the name or the content column.
func assetContentFieldRegexp(exp string) string {
	return "(name REGEXP '" + exp + "' OR content REGEXP '" + exp + "')"
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) {
|
2023-08-11 10:19:11 +08:00
|
|
|
|
table := "asset_contents_fts_case_insensitive"
|
2023-08-15 20:38:49 +08:00
|
|
|
|
fieldFilter := assetContentFieldRegexp(exp)
|
2023-08-11 11:04:47 +08:00
|
|
|
|
stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE " + fieldFilter + " AND ext IN " + typeFilter
|
|
|
|
|
|
result, _ := sql.QueryAssetContentNoLimit(stmt)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
if 1 > len(result) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
2023-08-11 10:28:32 +08:00
|
|
|
|
matchedAssetCount = int(result[0]["assets"].(int64))
|
2023-08-11 00:01:31 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
table := "asset_contents_fts_case_insensitive"
|
|
|
|
|
|
projections := "id, name, ext, path, size, updated, " +
|
2023-08-15 17:04:34 +08:00
|
|
|
|
"snippet(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "', '...', 64) AS content"
|
2023-08-11 00:01:31 +08:00
|
|
|
|
stmt := "SELECT " + projections + " FROM " + table + " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
|
2023-08-11 11:04:47 +08:00
|
|
|
|
stmt += ") AND ext IN " + typeFilter
|
2023-08-11 00:01:31 +08:00
|
|
|
|
stmt += " " + orderBy
|
|
|
|
|
|
stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
|
2023-08-11 10:19:11 +08:00
|
|
|
|
assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
|
|
|
|
|
|
ret = fromSQLAssetContents(&assetContents, beforeLen)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
if 1 > len(ret) {
|
2023-08-11 10:19:11 +08:00
|
|
|
|
ret = []*AssetContent{}
|
2023-08-11 00:01:31 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
stmt = gulu.Str.RemoveInvisible(stmt)
|
|
|
|
|
|
stmt = strings.TrimSpace(stmt)
|
2023-08-11 10:19:11 +08:00
|
|
|
|
assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
|
|
|
|
|
|
ret = fromSQLAssetContents(&assetContents, beforeLen)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
if 1 > len(ret) {
|
2023-08-11 10:19:11 +08:00
|
|
|
|
ret = []*AssetContent{}
|
2023-08-11 00:01:31 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
stmt = strings.ToLower(stmt)
|
|
|
|
|
|
stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ")
|
|
|
|
|
|
stmt = removeLimitClause(stmt)
|
2023-08-11 11:04:47 +08:00
|
|
|
|
result, _ := sql.QueryAssetContentNoLimit(stmt)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
if 1 > len(ret) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
matchedAssetCount = int(result[0]["assets"].(int64))
|
2023-08-11 00:01:31 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:28:32 +08:00
|
|
|
|
func fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) {
|
2023-08-11 00:01:31 +08:00
|
|
|
|
query = gulu.Str.RemoveInvisible(query)
|
|
|
|
|
|
|
|
|
|
|
|
table := "asset_contents_fts_case_insensitive"
|
|
|
|
|
|
stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
|
2023-08-11 11:04:47 +08:00
|
|
|
|
stmt += ") AND ext IN " + typeFilter
|
|
|
|
|
|
result, _ := sql.QueryAssetContentNoLimit(stmt)
|
2023-08-11 00:01:31 +08:00
|
|
|
|
if 1 > len(result) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
2023-08-11 10:28:32 +08:00
|
|
|
|
matchedAssetCount = int(result[0]["assets"].(int64))
|
2023-08-11 00:01:31 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:19:11 +08:00
|
|
|
|
func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) {
|
|
|
|
|
|
ret = []*AssetContent{}
|
|
|
|
|
|
for _, assetContent := range *assetContents {
|
|
|
|
|
|
ret = append(ret, fromSQLAssetContent(assetContent, beforeLen))
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent {
|
2023-08-15 17:04:34 +08:00
|
|
|
|
content := util.EscapeHTML(assetContent.Content)
|
|
|
|
|
|
if strings.Contains(content, search.SearchMarkLeft) {
|
|
|
|
|
|
content = strings.ReplaceAll(content, search.SearchMarkLeft, "<mark>")
|
|
|
|
|
|
content = strings.ReplaceAll(content, search.SearchMarkRight, "</mark>")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 10:19:11 +08:00
|
|
|
|
return &AssetContent{
|
|
|
|
|
|
ID: assetContent.ID,
|
|
|
|
|
|
Name: assetContent.Name,
|
|
|
|
|
|
Ext: assetContent.Ext,
|
|
|
|
|
|
Path: assetContent.Path,
|
|
|
|
|
|
Size: assetContent.Size,
|
2023-08-15 09:34:58 +08:00
|
|
|
|
HSize: humanize.Bytes(uint64(assetContent.Size)),
|
2023-08-11 10:19:11 +08:00
|
|
|
|
Updated: assetContent.Updated,
|
2023-08-15 17:04:34 +08:00
|
|
|
|
Content: content,
|
2023-08-11 10:19:11 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-11 00:01:31 +08:00
|
|
|
|
// buildAssetContentColumnFilter returns the FTS column-filter prefix that
// restricts MATCH queries to the name and content columns.
func buildAssetContentColumnFilter() string {
	const columns = "{name content}"
	return columns
}
|
|
|
|
|
|
|
|
|
|
|
|
// buildAssetContentTypeFilter turns the enabled-extension map into a SQL IN
// list, e.g. "('.txt','.md')". An empty map yields "" (no filtering clause
// material at all); a map with every extension disabled yields "()".
func buildAssetContentTypeFilter(types map[string]bool) string {
	if 0 == len(types) {
		return ""
	}

	enabled := make([]string, 0, len(types))
	for ext, on := range types {
		if on {
			enabled = append(enabled, ext)
		}
	}
	if 0 == len(enabled) {
		return "()"
	}
	return "('" + strings.Join(enabled, "','") + "')"
}
|
|
|
|
|
|
|
|
|
|
|
|
// buildAssetContentOrderBy maps the API's orderBy code to a SQL ORDER BY
// clause: 0 relevance desc, 1 relevance asc, 2 updated asc, 3 updated desc.
// Unknown codes fall back to relevance descending.
func buildAssetContentOrderBy(orderBy int) string {
	switch orderBy {
	case 1:
		return "ORDER BY rank ASC"
	case 2:
		return "ORDER BY updated ASC"
	case 3:
		return "ORDER BY updated DESC"
	default: // 0 and anything unrecognized: most relevant first
		return "ORDER BY rank DESC"
	}
}
|
|
|
|
|
|
|
2023-08-04 12:29:32 +08:00
|
|
|
|
// assetContentSearcher is the package-wide searcher holding the extension →
// parser registry used for indexing asset files.
var assetContentSearcher = NewAssetsSearcher()
|
|
|
|
|
|
|
|
|
|
|
|
func IndexAssetContent(absPath string) {
|
2023-08-15 20:16:04 +08:00
|
|
|
|
defer logging.Recover()
|
|
|
|
|
|
|
2023-08-04 12:29:32 +08:00
|
|
|
|
assetsDir := util.GetDataAssetsAbsPath()
|
|
|
|
|
|
|
2023-08-17 18:00:46 +08:00
|
|
|
|
ext := filepath.Ext(absPath)
|
2023-08-17 12:14:19 +08:00
|
|
|
|
parser := assetContentSearcher.GetParser(ext)
|
|
|
|
|
|
if nil == parser {
|
2023-08-04 12:29:32 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
result := parser.Parse(absPath)
|
|
|
|
|
|
if nil == result {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
info, err := os.Stat(absPath)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("stat [%s] failed: %s", absPath, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
|
|
|
|
|
|
|
|
|
|
|
|
assetContents := []*sql.AssetContent{
|
|
|
|
|
|
{
|
|
|
|
|
|
ID: ast.NewNodeID(),
|
2023-08-11 00:01:31 +08:00
|
|
|
|
Name: util.RemoveID(filepath.Base(p)),
|
|
|
|
|
|
Ext: ext,
|
2023-08-04 12:29:32 +08:00
|
|
|
|
Path: p,
|
|
|
|
|
|
Size: info.Size(),
|
|
|
|
|
|
Updated: info.ModTime().Unix(),
|
|
|
|
|
|
Content: result.Content,
|
|
|
|
|
|
},
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sql.DeleteAssetContentsByPathQueue(p)
|
|
|
|
|
|
sql.IndexAssetContentsQueue(assetContents)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-04 12:05:29 +08:00
|
|
|
|
func ReindexAssetContent() {
|
|
|
|
|
|
task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func fullReindexAssetContent() {
|
|
|
|
|
|
util.PushMsg(Conf.Language(216), 7*1000)
|
|
|
|
|
|
sql.InitAssetContentDatabase(true)
|
|
|
|
|
|
|
2023-08-04 12:29:32 +08:00
|
|
|
|
assetContentSearcher.FullIndex()
|
2023-08-04 12:05:29 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// init wires up the event-bus subscription that lets other components
// trigger an asset content index rebuild.
func init() {
	subscribeSQLAssetContentEvents()
}
|
|
|
|
|
|
|
|
|
|
|
|
// subscribeSQLAssetContentEvents registers the rebuild handler: whenever the
// asset content database must be rebuilt, schedule a full reindex.
func subscribeSQLAssetContentEvents() {
	eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() {
		ReindexAssetContent()
	})
}
|
|
|
|
|
|
|
|
|
|
|
|
var (
	// AssetsSearchEnabled toggles asset content search globally.
	AssetsSearchEnabled = true
)
|
|
|
|
|
|
|
|
|
|
|
|
// AssetsSearcher indexes asset files by dispatching each file extension to a
// registered AssetParser.
type AssetsSearcher struct {
	// parsers maps a lowercase file extension (with leading dot) to its parser.
	parsers map[string]AssetParser
	// lock guards concurrent access to parsers.
	lock *sync.Mutex
}
|
|
|
|
|
|
|
|
|
|
|
|
func (searcher *AssetsSearcher) GetParser(ext string) AssetParser {
|
|
|
|
|
|
searcher.lock.Lock()
|
|
|
|
|
|
defer searcher.lock.Unlock()
|
2023-08-04 12:29:32 +08:00
|
|
|
|
|
2023-08-17 18:00:46 +08:00
|
|
|
|
return searcher.parsers[strings.ToLower(ext)]
|
2023-08-04 12:05:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-04 12:29:32 +08:00
|
|
|
|
// FullIndex walks the whole data assets directory, parses every file that has
// a registered parser, and queues all extracted content for indexing.
// Note: results are accumulated in memory before being queued in one batch.
func (searcher *AssetsSearcher) FullIndex() {
	defer logging.Recover()

	assetsDir := util.GetDataAssetsAbsPath()
	if !gulu.File.IsDir(assetsDir) {
		// No assets directory yet — nothing to index.
		return
	}

	var results []*AssetParseResult
	filepath.Walk(assetsDir, func(absPath string, info fs.FileInfo, err error) error {
		if nil != err {
			// Propagating the error aborts the whole walk.
			logging.LogErrorf("walk dir [%s] failed: %s", absPath, err)
			return err
		}

		if info.IsDir() {
			return nil
		}

		ext := filepath.Ext(absPath)
		parser := searcher.GetParser(ext)
		if nil == parser {
			// Unsupported extension — skip the file, keep walking.
			return nil
		}

		result := parser.Parse(absPath)
		if nil == result {
			// Parser failed (too large, unreadable, …) — skip.
			return nil
		}

		// Fill in file metadata; the parser only supplies Content.
		result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
		result.Size = info.Size()
		result.Updated = info.ModTime().Unix()
		results = append(results, result)
		return nil
	})

	var assetContents []*sql.AssetContent
	for _, result := range results {
		assetContents = append(assetContents, &sql.AssetContent{
			ID:   ast.NewNodeID(),
			Name: util.RemoveID(filepath.Base(result.Path)),
			// Extensions are stored lowercased so the search type filter
			// (built from lowercase keys) matches them.
			Ext:     strings.ToLower(filepath.Ext(result.Path)),
			Path:    result.Path,
			Size:    result.Size,
			Updated: result.Updated,
			Content: result.Content,
		})
	}

	sql.IndexAssetContentsQueue(assetContents)
}
|
|
|
|
|
|
|
|
|
|
|
|
// NewAssetsSearcher builds a searcher with the default extension → parser
// registry: a shared plain-text parser for textual formats, plus dedicated
// parsers for docx/pptx/xlsx/pdf/epub. Keys must be lowercase (GetParser
// lowercases lookups).
func NewAssetsSearcher() *AssetsSearcher {
	// One shared instance handles every plain-text extension.
	txtAssetParser := &TxtAssetParser{}
	return &AssetsSearcher{
		parsers: map[string]AssetParser{
			".txt":      txtAssetParser,
			".md":       txtAssetParser,
			".markdown": txtAssetParser,
			".json":     txtAssetParser,
			".log":      txtAssetParser,
			".sql":      txtAssetParser,
			".html":     txtAssetParser,
			".xml":      txtAssetParser,
			".java":     txtAssetParser,
			".h":        txtAssetParser,
			".c":        txtAssetParser,
			".cpp":      txtAssetParser,
			".go":       txtAssetParser,
			".rs":       txtAssetParser,
			".swift":    txtAssetParser,
			".kt":       txtAssetParser,
			".py":       txtAssetParser,
			".php":      txtAssetParser,
			".js":       txtAssetParser,
			".css":      txtAssetParser,
			".ts":       txtAssetParser,
			".sh":       txtAssetParser,
			".bat":      txtAssetParser,
			".cmd":      txtAssetParser,
			".ini":      txtAssetParser,
			".yaml":     txtAssetParser,
			".rst":      txtAssetParser,
			".adoc":     txtAssetParser,
			".textile":  txtAssetParser,
			".opml":     txtAssetParser,
			".org":      txtAssetParser,
			".wiki":     txtAssetParser,
			".docx":     &DocxAssetParser{},
			".pptx":     &PptxAssetParser{},
			".xlsx":     &XlsxAssetParser{},
			".pdf":      &PdfAssetParser{},
			".epub":     &EpubAssetParser{},
		},

		lock: &sync.Mutex{},
	}
}
|
|
|
|
|
|
|
2023-08-17 18:00:46 +08:00
|
|
|
|
const (
	// TxtAssetContentMaxSize is the upper bound (4 MiB) on plain-text files
	// the TxtAssetParser will read; larger files are skipped.
	TxtAssetContentMaxSize = 1024 * 1024 * 4
)
|
|
|
|
|
|
|
2023-08-04 12:05:29 +08:00
|
|
|
|
// AssetParseResult carries the text extracted from one asset file. Parsers
// fill in Content only; Path, Size and Updated are set by the caller from
// file metadata (see AssetsSearcher.FullIndex).
type AssetParseResult struct {
	Path    string // workspace-relative path beginning with "assets"
	Size    int64  // file size in bytes
	Updated int64  // modification time, Unix seconds
	Content string // extracted, whitespace-normalized text
}
|
|
|
|
|
|
|
|
|
|
|
|
// AssetParser extracts indexable text from an asset file.
type AssetParser interface {
	// Parse reads the file at absPath and returns its extracted content,
	// or nil when the file cannot or should not be parsed.
	Parse(absPath string) *AssetParseResult
}
|
|
|
|
|
|
|
|
|
|
|
|
// TxtAssetParser handles plain-text assets (txt, md, source code, …) by
// reading the file directly.
type TxtAssetParser struct {
}
|
|
|
|
|
|
|
|
|
|
|
|
func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
2023-08-17 18:00:46 +08:00
|
|
|
|
info, err := os.Stat(absPath)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("stat file [%s] failed: %s", absPath, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if TxtAssetContentMaxSize < info.Size() {
|
|
|
|
|
|
logging.LogWarnf("file [%s] is too large [%s]", absPath, humanize.Bytes(uint64(info.Size())))
|
2023-08-04 12:05:29 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
data, err := filelock.ReadFile(absPath)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("read file [%s] failed: %s", absPath, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-05 18:47:52 +08:00
|
|
|
|
content := normalizeAssetContent(string(data))
|
2023-08-04 12:05:29 +08:00
|
|
|
|
ret = &AssetParseResult{
|
2023-08-05 18:47:52 +08:00
|
|
|
|
Content: content,
|
2023-08-04 12:05:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
2023-08-05 18:47:52 +08:00
|
|
|
|
|
|
|
|
|
|
// normalizeAssetContent collapses all runs of whitespace (spaces, tabs,
// newlines) into single spaces and trims the ends, producing compact text
// suitable for the FTS index.
func normalizeAssetContent(content string) string {
	return strings.Join(strings.Fields(content), " ")
}
|
2023-08-09 21:14:04 +08:00
|
|
|
|
|
|
|
|
|
|
func copyTempAsset(absPath string) (ret string) {
|
|
|
|
|
|
dir := filepath.Join(util.TempDir, "convert", "asset_content")
|
|
|
|
|
|
if err := os.MkdirAll(dir, 0755); nil != err {
|
|
|
|
|
|
logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-15 17:04:34 +08:00
|
|
|
|
baseName := filepath.Base(absPath)
|
|
|
|
|
|
if strings.HasPrefix(baseName, "~") {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-15 16:18:43 +08:00
|
|
|
|
filelock.RWLock.Lock()
|
|
|
|
|
|
defer filelock.RWLock.Unlock()
|
|
|
|
|
|
|
2023-08-09 21:14:04 +08:00
|
|
|
|
ret = filepath.Join(dir, gulu.Rand.String(7)+".docx")
|
2023-08-15 16:18:43 +08:00
|
|
|
|
if err := gulu.File.Copy(absPath, ret); nil != err {
|
|
|
|
|
|
logging.LogErrorf("copy [src=%s, dest=%s] failed: %s", absPath, ret, err)
|
2023-08-09 21:14:04 +08:00
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// DocxAssetParser extracts text from Word .docx files via docconv.
type DocxAssetParser struct {
}
|
|
|
|
|
|
|
|
|
|
|
|
func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|
|
|
|
|
if !strings.HasSuffix(strings.ToLower(absPath), ".docx") {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if !gulu.File.IsExist(absPath) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
tmp := copyTempAsset(absPath)
|
|
|
|
|
|
if "" == tmp {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer os.RemoveAll(tmp)
|
|
|
|
|
|
|
|
|
|
|
|
f, err := os.Open(tmp)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
|
|
|
|
data, _, err := docconv.ConvertDocx(f)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
var content = normalizeAssetContent(data)
|
|
|
|
|
|
ret = &AssetParseResult{
|
|
|
|
|
|
Content: content,
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// PptxAssetParser extracts text from PowerPoint .pptx files via docconv.
type PptxAssetParser struct {
}
|
|
|
|
|
|
|
|
|
|
|
|
func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|
|
|
|
|
if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if !gulu.File.IsExist(absPath) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
tmp := copyTempAsset(absPath)
|
|
|
|
|
|
if "" == tmp {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer os.RemoveAll(tmp)
|
|
|
|
|
|
|
|
|
|
|
|
f, err := os.Open(tmp)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
|
|
|
|
data, _, err := docconv.ConvertPptx(f)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
var content = normalizeAssetContent(data)
|
|
|
|
|
|
ret = &AssetParseResult{
|
|
|
|
|
|
Content: content,
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// XlsxAssetParser extracts cell text from Excel .xlsx workbooks via excelize.
type XlsxAssetParser struct {
}
|
|
|
|
|
|
|
|
|
|
|
|
func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|
|
|
|
|
if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if !gulu.File.IsExist(absPath) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
tmp := copyTempAsset(absPath)
|
|
|
|
|
|
if "" == tmp {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer os.RemoveAll(tmp)
|
|
|
|
|
|
|
|
|
|
|
|
x, err := excelize.OpenFile(tmp)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer x.Close()
|
|
|
|
|
|
|
|
|
|
|
|
buf := bytes.Buffer{}
|
|
|
|
|
|
sheetMap := x.GetSheetMap()
|
|
|
|
|
|
for _, sheetName := range sheetMap {
|
|
|
|
|
|
rows, getErr := x.GetRows(sheetName)
|
|
|
|
|
|
if nil != getErr {
|
|
|
|
|
|
logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, getErr)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
for _, row := range rows {
|
|
|
|
|
|
for _, colCell := range row {
|
|
|
|
|
|
buf.WriteString(colCell + " ")
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
var content = normalizeAssetContent(buf.String())
|
|
|
|
|
|
ret = &AssetParseResult{
|
|
|
|
|
|
Content: content,
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
2023-08-17 05:52:59 +02:00
|
|
|
|
|
|
|
|
|
|
// PdfAssetParser extracts text from PDF files using the PDFium WebAssembly
// module (klippa-app/go-pdfium).
type PdfAssetParser struct {
}
|
|
|
|
|
|
|
|
|
|
|
|
// Parse will parse a PDF document using PDFium webassembly module
|
|
|
|
|
|
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|
|
|
|
|
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if !gulu.File.IsExist(absPath) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
tmp := copyTempAsset(absPath)
|
|
|
|
|
|
if "" == tmp {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer os.RemoveAll(tmp)
|
|
|
|
|
|
|
|
|
|
|
|
f, err := os.Open(tmp)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
|
|
|
|
stat, err := f.Stat()
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// initialize pdfium with one worker
|
|
|
|
|
|
pool, err := webassembly.Init(webassembly.Config{
|
|
|
|
|
|
MinIdle: 1,
|
|
|
|
|
|
MaxIdle: 1,
|
|
|
|
|
|
MaxTotal: 1,
|
|
|
|
|
|
})
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer pool.Close()
|
|
|
|
|
|
|
|
|
|
|
|
instance, err := pool.GetInstance(time.Second * 30)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer instance.Close()
|
|
|
|
|
|
|
|
|
|
|
|
// get number of pages inside PDF document
|
|
|
|
|
|
doc, err := instance.OpenDocument(&requests.OpenDocument{
|
|
|
|
|
|
FileReader: f,
|
|
|
|
|
|
FileReaderSize: stat.Size(),
|
|
|
|
|
|
})
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
|
|
|
|
|
|
Document: doc.Document,
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
// loop through pages and get content
|
|
|
|
|
|
content := ""
|
|
|
|
|
|
for page := 0; page < pageCount.PageCount; page++ {
|
|
|
|
|
|
req := &requests.GetPageText{
|
|
|
|
|
|
Page: requests.Page{
|
|
|
|
|
|
ByIndex: &requests.PageByIndex{
|
|
|
|
|
|
Document: doc.Document,
|
|
|
|
|
|
Index: page,
|
|
|
|
|
|
},
|
|
|
|
|
|
},
|
|
|
|
|
|
}
|
|
|
|
|
|
pt, err := instance.GetPageText(req)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
content += " " + normalizeAssetContent(pt.Text)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
ret = &AssetParseResult{
|
|
|
|
|
|
Content: content,
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
2023-08-18 16:20:32 +08:00
|
|
|
|
|
|
|
|
|
|
// EpubAssetParser extracts text from EPUB books via wmentor/epub.
type EpubAssetParser struct {
}
|
|
|
|
|
|
|
|
|
|
|
|
func (parser *EpubAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|
|
|
|
|
if !strings.HasSuffix(strings.ToLower(absPath), ".epub") {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if !gulu.File.IsExist(absPath) {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
tmp := copyTempAsset(absPath)
|
|
|
|
|
|
if "" == tmp {
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer os.RemoveAll(tmp)
|
|
|
|
|
|
|
|
|
|
|
|
f, err := os.Open(tmp)
|
|
|
|
|
|
if nil != err {
|
|
|
|
|
|
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
|
|
|
|
buf := bytes.Buffer{}
|
|
|
|
|
|
if err = epub.ToTxt(tmp, &buf); nil != err {
|
|
|
|
|
|
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
content := normalizeAssetContent(buf.String())
|
|
|
|
|
|
ret = &AssetParseResult{
|
|
|
|
|
|
Content: content,
|
|
|
|
|
|
}
|
|
|
|
|
|
return
|
|
|
|
|
|
}
|