siyuan/kernel/search/find.go

// SiYuan - Refactor your thinking
// Copyright (c) 2020-present, b3log.org
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

package search

import (
	"bytes"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"sync"
)

type Match struct {
	Path   string
	Target string
}

func FindAllMatchedPaths(root string, targets []string) []string {
	matches := FindAllMatches(root, targets)
	return pathsFromMatches(matches)
}

func FindAllMatchedTargets(root string, targets []string) []string {
	matches := FindAllMatches(root, targets)
	return targetsFromMatches(matches)
}

// FindAllMatches 遍历 root 下的文件，返回所有命中的结果（文件路径 + 命中目标）
// targets 为空或 root 为空时返回 nil
func FindAllMatches(root string, targets []string) []Match {
	if root == "" || len(targets) == 0 {
		return nil
	}

	// 构建基于首字节的模式索引，并计算最长模式长度
	patternIndex := make(map[byte][][]byte)
	var maxLen int
	for _, t := range targets {
		if t == "" {
			continue
		}
		b := []byte(t)
		if len(b) > maxLen {
			maxLen = len(b)
		}
		patternIndex[b[0]] = append(patternIndex[b[0]], b)
	}
	if len(patternIndex) == 0 {
		return nil
	}

	jobs := make(chan string, 256)
	results := make(chan Match, 256)

	var wg sync.WaitGroup
	var collectWg sync.WaitGroup

	var matches []Match
	collectWg.Add(1)
	go func() {
		defer collectWg.Done()
		for m := range results {
			matches = append(matches, m)
		}
	}()

	numWorkers := runtime.NumCPU()
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for p := range jobs {
				hits := scanFileForTargets(p, patternIndex, maxLen)
				if len(hits) > 0 {
					for _, t := range hits {
						results <- Match{Path: p, Target: t}
					}
				}
			}
		}()
	}

	_ = filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error {
		if err == nil && d.Type().IsRegular() {
			jobs <- path
		}
		return nil
	})

	close(jobs)
	wg.Wait()
	close(results)
	collectWg.Wait()
	return matches
}

// scanFileForTargets 在文件中流式搜索所有目标（基于首字节索引），返回去重后的命中目标字符串列表
func scanFileForTargets(path string, patternIndex map[byte][][]byte, maxLen int) []string {
	f, err := os.Open(path)
	if err != nil {
		return nil
	}
	defer f.Close()

	// 构建字节位图，加速首字节检测
	var bitmap [256]bool
	for b := range patternIndex {
		bitmap[b] = true
	}

	found := make(map[string]struct{})
	buf := make([]byte, 64<<10) // 64KB
	// 保留上一次块末尾的重叠数据以支持跨块匹配
	var tail []byte

	for {
		n, err := f.Read(buf)
		if n > 0 {
			// data = tail + buf[:n]
			data := make([]byte, len(tail)+n)
			copy(data, tail)
			copy(data[len(tail):], buf[:n])

			// 扫描 data，查找任意候选首字节位置
			i := 0
			for i < len(data) {
				// 快速跳过非候选字节
				for i < len(data) && !bitmap[data[i]] {
					i++
				}
				if i >= len(data) {
					break
				}
				b := data[i]
				// 对应首字节的所有模式进行校验
				for _, pat := range patternIndex[b] {
					pl := len(pat)
					// 如果剩余字节不足以完全匹配，则交由下一轮（通过 tail 保证）
					if i+pl <= len(data) {
						if bytes.Equal(pat, data[i:i+pl]) {
							found[string(pat)] = struct{}{}
						}
					}
				}
				i++
			}

			// 保留最后 maxLen-1 字节作为下一块的 tail（避免超长内存分配）
			if maxLen <= 1 {
				tail = nil
			} else {
				if len(data) >= maxLen-1 {
					tail = append(tail[:0], data[len(data)-(maxLen-1):]...)
				} else {
					tail = append(tail[:0], data...)
				}
			}
		}
		if err != nil {
			if err == io.EOF {
				break
			}
			// 读取出错，返回已有结果
			break
		}
	}

	if len(found) == 0 {
		return nil
	}
	res := make([]string, 0, len(found))
	for k := range found {
		res = append(res, k)
	}
	return res
}

// pathsFromMatches 从 Match 列表中返回去重的路径切片（保留首次出现顺序）
func pathsFromMatches(ms []Match) []string {
	if len(ms) == 0 {
		return nil
	}
	seen := make(map[string]struct{})
	paths := make([]string, 0)
	for _, m := range ms {
		if _, ok := seen[m.Path]; ok {
			continue
		}
		seen[m.Path] = struct{}{}
		paths = append(paths, m.Path)
	}
	return paths
}

// targetsFromMatches 从 Match 列表中返回去重的目标切片（保留首次出现顺序）
func targetsFromMatches(ms []Match) []string {
	if len(ms) == 0 {
		return nil
	}
	seen := make(map[string]struct{})
	targets := make([]string, 0)
	for _, m := range ms {
		if _, ok := seen[m.Target]; ok {
			continue
		}
		seen[m.Target] = struct{}{}
		targets = append(targets, m.Target)
	}
	return targets
}