From c0bd645048a03f188dbb9853741f3c2b9a5358f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B4=AE=E7=94=9F?= Date: Sun, 16 Jun 2024 22:55:22 +0800 Subject: [PATCH] Kernel API OCR returns text coordinate information (#11738) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 优化setImageOCRText接口调用 * 扩展 ocr 接口,添加 ocrJSON 返回字段 * 过滤不可见字符 * 返回的ocr文本添加空格 --- app/src/menus/protyle.ts | 16 +++++---- kernel/api/asset.go | 8 ++--- kernel/model/ocr.go | 2 +- kernel/sql/block.go | 2 +- kernel/util/ocr.go | 71 ++++++++++++++++++++++++++++------------ 5 files changed, 65 insertions(+), 34 deletions(-) diff --git a/app/src/menus/protyle.ts b/app/src/menus/protyle.ts index 1e59bb119..cc174de01 100644 --- a/app/src/menus/protyle.ts +++ b/app/src/menus/protyle.ts @@ -1019,7 +1019,9 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme fetchPost("/api/asset/getImageOCRText", { path: imgElement.getAttribute("src") }, (response) => { - element.querySelector("textarea").value = response.data.text; + const textarea =element.querySelector("textarea") + textarea.value = response.data.text; + textarea.dataset.ocrText = response.data.text; }); } }, { @@ -1031,11 +1033,6 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme fetchPost("/api/asset/ocr", { path: imgElement.getAttribute("src"), force: true - }, (response) => { - fetchPost("/api/asset/setImageOCRText", { - path: imgElement.getAttribute("src"), - text: response.data.text - }); }); } }], @@ -1119,6 +1116,13 @@ export const imgMenu = (protyle: IProtyle, range: Range, assetElement: HTMLEleme const textElements = window.siyuan.menus.menu.element.querySelectorAll("textarea"); textElements[0].focus(); window.siyuan.menus.menu.removeCB = () => { + const ocrElement = window.siyuan.menus.menu.element.querySelector('[data-type="ocr"]') as HTMLTextAreaElement; + if (ocrElement && ocrElement.dataset.ocrText !== ocrElement.value) { + fetchPost("/api/asset/setImageOCRText", { + path: imgElement.getAttribute("src"), + text: ocrElement.value + }); + } imgElement.setAttribute("alt", textElements[2].value.replace(/\n|\r\n|\r|\u2028|\u2029/g, "")); nodeElement.setAttribute("updated", dayjs().format("YYYYMMDDHHmmss")); updateTransaction(protyle, id, nodeElement.outerHTML, html); diff --git a/kernel/api/asset.go b/kernel/api/asset.go index 2b3ad7223..08b11f5e1 100644 --- a/kernel/api/asset.go +++ b/kernel/api/asset.go @@ -137,13 +137,11 @@ func ocr(c *gin.Context) { } path := arg["path"].(string) - force := false - if forceArg := arg["force"]; nil != forceArg { - force = forceArg.(bool) - } + ocrJSON := util.OcrAsset(path) ret.Data = map[string]interface{}{ - "text": util.OcrAsset(path, force), + "text": util.GetOcrJsonText(ocrJSON), + "ocrJSON": ocrJSON, } } diff --git a/kernel/model/ocr.go b/kernel/model/ocr.go index 61e9c92ed..638c5ca6b 100644 --- a/kernel/model/ocr.go +++ b/kernel/model/ocr.go @@ -33,7 +33,7 @@ func autoOCRAssets() { assets := getUnOCRAssetsAbsPaths() if 0 < len(assets) { for i, assetAbsPath := range assets { - text := util.Tesseract(assetAbsPath) + text := util.GetOcrJsonText(util.Tesseract(assetAbsPath)) p := strings.TrimPrefix(assetAbsPath, assetsPath) p = "assets" + filepath.ToSlash(p) util.SetAssetText(p, text) diff --git a/kernel/sql/block.go b/kernel/sql/block.go index f6efa0152..9266d82f0 100644 --- a/kernel/sql/block.go +++ b/kernel/sql/block.go @@ -198,7 +198,7 @@ func nodeStaticContent(node *ast.Node, excludeTypes []string, includeTextMarkATi var linkDestStr, ocrText string if nil != linkDest { linkDestStr = linkDest.TokensStr() - ocrText = util.OcrAsset(linkDestStr, false) + ocrText = util.GetAssetText(linkDestStr) } linkText := n.ChildByType(ast.NodeLinkText) diff --git a/kernel/util/ocr.go b/kernel/util/ocr.go index 3f2e8c3b6..2edb9bb3f 100644 --- a/kernel/util/ocr.go +++ b/kernel/util/ocr.go @@ -149,22 +149,16 @@ func ExistsAssetText(asset string) (ret bool) { return } -func OcrAsset(asset string, force bool) (ret string) { - if !force { - assetsTextsLock.Lock() - ret = assetsTexts[asset] - assetsTextsLock.Unlock() - return - } - +func OcrAsset(asset string) (ret []map[string]interface{}) { assetsPath := GetDataAssetsAbsPath() assetAbsPath := strings.TrimPrefix(asset, "assets") assetAbsPath = filepath.Join(assetsPath, assetAbsPath) ret = Tesseract(assetAbsPath) assetsTextsLock.Lock() - assetsTexts[asset] = ret + ocrText := GetOcrJsonText(ret) + assetsTexts[asset] = ocrText assetsTextsLock.Unlock() - if "" != ret { + if "" != ocrText { assetsTextsChanged.Store(true) } return @@ -184,9 +178,9 @@ func IsTesseractExtractable(p string) bool { // tesseractOCRLock 用于 Tesseract OCR 加锁串行执行提升稳定性 https://github.com/siyuan-note/siyuan/issues/7265 var tesseractOCRLock = sync.Mutex{} -func Tesseract(imgAbsPath string) string { +func Tesseract(imgAbsPath string) (ret []map[string]interface{}) { if ContainerStd != Container || !TesseractEnabled { - return "" + return } defer logging.Recover() @@ -194,16 +188,16 @@ func Tesseract(imgAbsPath string) string { defer tesseractOCRLock.Unlock() if !IsTesseractExtractable(imgAbsPath) { - return "" + return } info, err := os.Stat(imgAbsPath) if nil != err { - return "" + return } if TesseractMaxSize < uint64(info.Size()) { - return "" + return } defer logging.Recover() @@ -211,24 +205,59 @@ func Tesseract(imgAbsPath string) string { ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second) defer cancel() - cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+")) + cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv") gulu.CmdAttr(cmd) output, err := cmd.CombinedOutput() if ctx.Err() == context.DeadlineExceeded { logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size()) - return "" + return } if nil != err { logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err) - return "" + return } - ret := string(output) + tsv := string(output) + + // 按行分割 TSV 数据 + lines := strings.Split(tsv, "\r\n") + + // 解析 TSV 数据 跳过标题行,从第二行开始处理 + for _, line := range lines[1:] { + if line == "" { + continue // 跳过空行 + } + // 分割每列数据 + fields := strings.Split(line, "\t") + // 将字段名和字段值映射到一个 map 中 + dataMap := make(map[string]interface{}) + for i, header := range strings.Split(lines[0], "\t") { + dataMap[header] = fields[i] + } + ret = append(ret, dataMap) + } + + tsv = gulu.Str.RemoveInvisible(tsv) + tsv = RemoveRedundantSpace(tsv) + msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret))) + PushStatusBar(msg) + return +} + +// 提取并连接所有 text 字段的函数 +func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) { + for _, dataMap := range jsonData { + // 检查 text 字段是否存在 + if text, ok := dataMap["text"]; ok { + // 确保 text 是字符串类型 + if textStr, ok := text.(string); ok { + ret += " " + textStr + } + } + } ret = gulu.Str.RemoveInvisible(ret) ret = RemoveRedundantSpace(ret) - msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(ret)) - PushStatusBar(msg) return ret }