| // Copyright 2017 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package finder |
| |
| import ( |
| "bufio" |
| "bytes" |
| "encoding/json" |
| "errors" |
| "fmt" |
| "io" |
| "os" |
| "path/filepath" |
| "runtime" |
| "sort" |
| "strings" |
| "sync" |
| "sync/atomic" |
| "time" |
| |
| "android/soong/finder/fs" |
| ) |
| |
| // This file provides a Finder struct that can quickly search for files satisfying |
| // certain criteria. |
| // This Finder gets its speed partially from parallelism and partially from caching. |
| // If a Stat call returns the same result as last time, then it means Finder |
| // can skip the ReadDir call for that dir. |
| |
| // The primary data structure used by the finder is the field Finder.nodes , |
| // which is a tree of nodes of type *pathMap . |
| // Each node represents a directory on disk, along with its stats, subdirectories, |
| // and contained files. |
| |
| // The common use case for the Finder is that the caller creates a Finder and gives |
| // it the same query that was given to it in the previous execution. |
| // In this situation, the major events that take place are: |
| // 1. The Finder begins to load its db |
| // 2. The Finder begins to stat the directories mentioned in its db (using multiple threads) |
| // Calling Stat on each of these directories is generally a large fraction of the total time |
| // 3. The Finder begins to construct a separate tree of nodes in each of its threads |
| // 4. The Finder merges the individual node trees into the main node tree |
| // 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date |
| // These ReadDir calls might prompt additional Stat calls, etc |
| // 6. The Finder waits for all loading to complete |
| // 7. The Finder searches the cache for files matching the user's query (using multiple threads) |
| |
| // These are the invariants regarding concurrency: |
| // 1. The public methods of Finder are threadsafe. |
| // The public methods are only performance-optimized for one caller at a time, however. |
| // For the moment, multiple concurrent callers shouldn't expect any better performance than |
| // multiple serial callers. |
| // 2. While building the node tree, only one thread may ever access the <children> collection of a |
| // *pathMap at once. |
| // a) The thread that accesses the <children> collection is the thread that discovers the |
| // children (by reading them from the cache or by having received a response to ReadDir). |
| // 1) Consequently, the thread that discovers the children also spawns requests to stat |
| // subdirs. |
| // b) Consequently, while building the node tree, no thread may do a lookup of its |
| // *pathMap via filepath because another thread may be adding children to the |
| // <children> collection of an ancestor node. Additionally, in rare cases, another thread |
| // may be removing children from an ancestor node if the children were only discovered to |
| // be irrelevant after calling ReadDir (which happens if a prune-file was just added). |
| // 3. No query will begin to be serviced until all loading (both reading the db |
| // and scanning the filesystem) is complete. |
| // Tests indicate that it only takes about 10% as long to search the in-memory cache as to |
| // generate it, making this not a huge loss in performance. |
| // 4. The parsing of the db and the initial setup of the pathMap tree must complete before |
| // beginning to call listDirSync (because listDirSync can create new entries in the pathMap) |
| |
| // see cmd/finder.go or finder_test.go for usage examples |
| |
| // Update versionString whenever making a backwards-incompatible change to the cache file format |
| const versionString = "Android finder version 1" |
| |
| // a CacheParams specifies which files and directories the user wishes be scanned and |
| // potentially added to the cache |
| type CacheParams struct { |
| // WorkingDirectory is used as a base for any relative file paths given to the Finder |
| WorkingDirectory string |
| |
| // RootDirs are the root directories used to initiate the search |
| RootDirs []string |
| |
| // Whether symlinks are followed. If set, symlinks back to their own parent |
| // directory don't work. |
| FollowSymlinks bool |
| |
| // ExcludeDirs are directory names that if encountered are removed from the search |
| ExcludeDirs []string |
| |
| // PruneFiles are file names that if encountered prune their entire directory |
| // (including siblings) |
| PruneFiles []string |
| |
| // IncludeFiles are file names to include as matches |
| IncludeFiles []string |
| |
| // IncludeSuffixes are filename suffixes to include as matches. |
| IncludeSuffixes []string |
| } |
| |
| // a cacheConfig stores the inputs that determine what should be included in the cache |
| type cacheConfig struct { |
| CacheParams |
| |
| // FilesystemView is a unique identifier telling which parts of which file systems |
| // are readable by the Finder. In practice its value is essentially username@hostname. |
| // FilesystemView is set to ensure that a cache file copied to another host or |
| // found by another user doesn't inadvertently get reused. |
| FilesystemView string |
| } |
| |
| func (p *cacheConfig) Dump() ([]byte, error) { |
| bytes, err := json.Marshal(p) |
| return bytes, err |
| } |
| |
| // a cacheMetadata stores version information about the cache |
| type cacheMetadata struct { |
| // The Version enables the Finder to determine whether it can even parse the file |
| // If the version changes, the entire cache file must be regenerated |
| Version string |
| |
| // The CacheParams enables the Finder to determine whether the parameters match |
| // If the CacheParams change, the Finder can choose how much of the cache file to reuse |
| // (although in practice, the Finder will probably choose to ignore the entire file anyway) |
| Config cacheConfig |
| } |
| |
| type Logger interface { |
| Output(calldepth int, s string) error |
| } |
| |
| // the Finder is the main struct that callers will want to use |
| type Finder struct { |
| // configuration |
| DbPath string |
| numDbLoadingThreads int |
| numSearchingThreads int |
| cacheMetadata cacheMetadata |
| logger Logger |
| filesystem fs.FileSystem |
| |
| // temporary state |
| threadPool *threadPool |
| mutex sync.Mutex |
| fsErrs []fsErr |
| errlock sync.Mutex |
| shutdownWaitgroup sync.WaitGroup |
| |
| // non-temporary state |
| modifiedFlag int32 |
| nodes pathMap |
| } |
| |
| var defaultNumThreads = runtime.NumCPU() * 2 |
| |
| // New creates a new Finder for use |
| func New(cacheParams CacheParams, filesystem fs.FileSystem, |
| logger Logger, dbPath string) (f *Finder, err error) { |
| return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads) |
| } |
| |
| // newImpl is like New but accepts more params |
| func newImpl(cacheParams CacheParams, filesystem fs.FileSystem, |
| logger Logger, dbPath string, numThreads int) (f *Finder, err error) { |
| numDbLoadingThreads := numThreads |
| numSearchingThreads := numThreads |
| |
| metadata := cacheMetadata{ |
| Version: versionString, |
| Config: cacheConfig{ |
| CacheParams: cacheParams, |
| FilesystemView: filesystem.ViewId(), |
| }, |
| } |
| |
| f = &Finder{ |
| numDbLoadingThreads: numDbLoadingThreads, |
| numSearchingThreads: numSearchingThreads, |
| cacheMetadata: metadata, |
| logger: logger, |
| filesystem: filesystem, |
| |
| nodes: *newPathMap("/"), |
| DbPath: dbPath, |
| |
| shutdownWaitgroup: sync.WaitGroup{}, |
| } |
| |
| f.loadFromFilesystem() |
| |
| // check for any filesystem errors |
| err = f.getErr() |
| if err != nil { |
| return nil, err |
| } |
| |
| // confirm that every path mentioned in the CacheConfig exists |
| for _, path := range cacheParams.RootDirs { |
| if !filepath.IsAbs(path) { |
| path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path) |
| } |
| node := f.nodes.GetNode(filepath.Clean(path), false) |
| if node == nil || node.ModTime == 0 { |
| return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path) |
| } |
| } |
| |
| return f, nil |
| } |
| |
| // FindNamed searches for every cached file |
| func (f *Finder) FindAll() []string { |
| return f.FindAt("/") |
| } |
| |
| // FindNamed searches for every cached file under <rootDir> |
| func (f *Finder) FindAt(rootDir string) []string { |
| filter := func(entries DirEntries) (dirNames []string, fileNames []string) { |
| return entries.DirNames, entries.FileNames |
| } |
| return f.FindMatching(rootDir, filter) |
| } |
| |
| // FindNamed searches for every cached file named <fileName> |
| func (f *Finder) FindNamed(fileName string) []string { |
| return f.FindNamedAt("/", fileName) |
| } |
| |
| // FindNamedAt searches under <rootPath> for every file named <fileName> |
| // The reason a caller might use FindNamedAt instead of FindNamed is if they want |
| // to limit their search to a subset of the cache |
| func (f *Finder) FindNamedAt(rootPath string, fileName string) []string { |
| filter := func(entries DirEntries) (dirNames []string, fileNames []string) { |
| matches := []string{} |
| for _, foundName := range entries.FileNames { |
| if foundName == fileName { |
| matches = append(matches, foundName) |
| } |
| } |
| return entries.DirNames, matches |
| |
| } |
| return f.FindMatching(rootPath, filter) |
| } |
| |
| // FindFirstNamed searches for every file named <fileName> |
| // Whenever it finds a match, it stops search subdirectories |
| func (f *Finder) FindFirstNamed(fileName string) []string { |
| return f.FindFirstNamedAt("/", fileName) |
| } |
| |
| // FindFirstNamedAt searches for every file named <fileName> |
| // Whenever it finds a match, it stops search subdirectories |
| func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string { |
| filter := func(entries DirEntries) (dirNames []string, fileNames []string) { |
| matches := []string{} |
| for _, foundName := range entries.FileNames { |
| if foundName == fileName { |
| matches = append(matches, foundName) |
| } |
| } |
| |
| if len(matches) > 0 { |
| return []string{}, matches |
| } |
| return entries.DirNames, matches |
| } |
| return f.FindMatching(rootPath, filter) |
| } |
| |
| // FindMatching is the most general exported function for searching for files in the cache |
| // The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries |
| // in place, removing file paths and directories as desired. |
| // WalkFunc will be invoked potentially many times in parallel, and must be threadsafe. |
| func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string { |
| // set up some parameters |
| scanStart := time.Now() |
| var isRel bool |
| workingDir := f.cacheMetadata.Config.WorkingDirectory |
| |
| isRel = !filepath.IsAbs(rootPath) |
| if isRel { |
| rootPath = filepath.Join(workingDir, rootPath) |
| } |
| |
| rootPath = filepath.Clean(rootPath) |
| |
| // ensure nothing else is using the Finder |
| f.verbosef("FindMatching waiting for finder to be idle\n") |
| f.lock() |
| defer f.unlock() |
| |
| node := f.nodes.GetNode(rootPath, false) |
| if node == nil { |
| f.verbosef("No data for path %v ; apparently not included in cache params: %v\n", |
| rootPath, f.cacheMetadata.Config.CacheParams) |
| // path is not found; don't do a search |
| return []string{} |
| } |
| |
| // search for matching files |
| f.verbosef("Finder finding %v using cache\n", rootPath) |
| results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads) |
| |
| // format and return results |
| if isRel { |
| for i := 0; i < len(results); i++ { |
| results[i] = strings.Replace(results[i], workingDir+"/", "", 1) |
| } |
| } |
| sort.Strings(results) |
| f.verbosef("Found %v files under %v in %v using cache\n", |
| len(results), rootPath, time.Since(scanStart)) |
| return results |
| } |
| |
| // Shutdown declares that the finder is no longer needed and waits for its cleanup to complete |
| // Currently, that only entails waiting for the database dump to complete. |
| func (f *Finder) Shutdown() { |
| f.WaitForDbDump() |
| } |
| |
| // WaitForDbDump returns once the database has been written to f.DbPath. |
| func (f *Finder) WaitForDbDump() { |
| f.shutdownWaitgroup.Wait() |
| } |
| |
| // End of public api |
| |
| func (f *Finder) goDumpDb() { |
| if f.wasModified() { |
| f.shutdownWaitgroup.Add(1) |
| go func() { |
| err := f.dumpDb() |
| if err != nil { |
| f.verbosef("%v\n", err) |
| } |
| f.shutdownWaitgroup.Done() |
| }() |
| } else { |
| f.verbosef("Skipping dumping unmodified db\n") |
| } |
| } |
| |
| // joinCleanPaths is like filepath.Join but is faster because |
| // joinCleanPaths doesn't have to support paths ending in "/" or containing ".." |
| func joinCleanPaths(base string, leaf string) string { |
| if base == "" { |
| return leaf |
| } |
| if base == "/" { |
| return base + leaf |
| } |
| if leaf == "" { |
| return base |
| } |
| return base + "/" + leaf |
| } |
| |
| func (f *Finder) verbosef(format string, args ...interface{}) { |
| f.logger.Output(2, fmt.Sprintf(format, args...)) |
| } |
| |
| // loadFromFilesystem populates the in-memory cache based on the contents of the filesystem |
| func (f *Finder) loadFromFilesystem() { |
| f.threadPool = newThreadPool(f.numDbLoadingThreads) |
| |
| err := f.startFromExternalCache() |
| if err != nil { |
| f.startWithoutExternalCache() |
| } |
| |
| f.goDumpDb() |
| |
| f.threadPool = nil |
| } |
| |
| func (f *Finder) startFind(path string) { |
| if !filepath.IsAbs(path) { |
| path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path) |
| } |
| node := f.nodes.GetNode(path, true) |
| f.statDirAsync(node) |
| } |
| |
| func (f *Finder) lock() { |
| f.mutex.Lock() |
| } |
| |
| func (f *Finder) unlock() { |
| f.mutex.Unlock() |
| } |
| |
| // a statResponse is the relevant portion of the response from the filesystem to a Stat call |
| type statResponse struct { |
| ModTime int64 |
| Inode uint64 |
| Device uint64 |
| } |
| |
| // a pathAndStats stores a path and its stats |
| type pathAndStats struct { |
| statResponse |
| |
| Path string |
| } |
| |
| // a dirFullInfo stores all of the relevant information we know about a directory |
| type dirFullInfo struct { |
| pathAndStats |
| |
| FileNames []string |
| } |
| |
| // a PersistedDirInfo is the information about a dir that we save to our cache on disk |
| type PersistedDirInfo struct { |
| // These field names are short because they are repeated many times in the output json file |
| P string // path |
| T int64 // modification time |
| I uint64 // inode number |
| F []string // relevant filenames contained |
| } |
| |
| // a PersistedDirs is the information that we persist for a group of dirs |
| type PersistedDirs struct { |
| // the device on which each directory is stored |
| Device uint64 |
| // the common root path to which all contained dirs are relative |
| Root string |
| // the directories themselves |
| Dirs []PersistedDirInfo |
| } |
| |
| // a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time |
| type CacheEntry []PersistedDirs |
| |
| // a DirEntries lists the files and directories contained directly within a specific directory |
| type DirEntries struct { |
| Path string |
| |
| // elements of DirNames are just the dir names; they don't include any '/' character |
| DirNames []string |
| // elements of FileNames are just the file names; they don't include '/' character |
| FileNames []string |
| } |
| |
| // a WalkFunc is the type that is passed into various Find functions for determining which |
| // directories the caller wishes be walked. The WalkFunc is expected to decide which |
| // directories to walk and which files to consider as matches to the original query. |
| type WalkFunc func(DirEntries) (dirs []string, files []string) |
| |
| // a mapNode stores the relevant stats about a directory to be stored in a pathMap |
| type mapNode struct { |
| statResponse |
| FileNames []string |
| } |
| |
| // a pathMap implements the directory tree structure of nodes |
| type pathMap struct { |
| mapNode |
| |
| path string |
| |
| children map[string]*pathMap |
| |
| // number of descendent nodes, including self |
| approximateNumDescendents int |
| } |
| |
| func newPathMap(path string) *pathMap { |
| result := &pathMap{path: path, children: make(map[string]*pathMap, 4), |
| approximateNumDescendents: 1} |
| return result |
| } |
| |
| // GetNode returns the node at <path> |
| func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap { |
| if len(path) > 0 && path[0] == '/' { |
| path = path[1:] |
| } |
| |
| node := m |
| for { |
| if path == "" { |
| return node |
| } |
| |
| index := strings.Index(path, "/") |
| var firstComponent string |
| if index >= 0 { |
| firstComponent = path[:index] |
| path = path[index+1:] |
| } else { |
| firstComponent = path |
| path = "" |
| } |
| |
| child, found := node.children[firstComponent] |
| |
| if !found { |
| if createIfNotFound { |
| child = node.newChild(firstComponent) |
| } else { |
| return nil |
| } |
| } |
| |
| node = child |
| } |
| } |
| |
| func (m *pathMap) newChild(name string) (child *pathMap) { |
| path := joinCleanPaths(m.path, name) |
| newChild := newPathMap(path) |
| m.children[name] = newChild |
| |
| return m.children[name] |
| } |
| |
| func (m *pathMap) UpdateNumDescendents() int { |
| count := 1 |
| for _, child := range m.children { |
| count += child.approximateNumDescendents |
| } |
| m.approximateNumDescendents = count |
| return count |
| } |
| |
| func (m *pathMap) UpdateNumDescendentsRecursive() { |
| for _, child := range m.children { |
| child.UpdateNumDescendentsRecursive() |
| } |
| m.UpdateNumDescendents() |
| } |
| |
| func (m *pathMap) MergeIn(other *pathMap) { |
| for key, theirs := range other.children { |
| ours, found := m.children[key] |
| if found { |
| ours.MergeIn(theirs) |
| } else { |
| m.children[key] = theirs |
| } |
| } |
| if other.ModTime != 0 { |
| m.mapNode = other.mapNode |
| } |
| m.UpdateNumDescendents() |
| } |
| |
| func (m *pathMap) DumpAll() []dirFullInfo { |
| results := []dirFullInfo{} |
| m.dumpInto("", &results) |
| return results |
| } |
| |
| func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) { |
| *results = append(*results, |
| dirFullInfo{ |
| pathAndStats{statResponse: m.statResponse, Path: path}, |
| m.FileNames}, |
| ) |
| for key, child := range m.children { |
| childPath := joinCleanPaths(path, key) |
| if len(childPath) == 0 || childPath[0] != '/' { |
| childPath = "/" + childPath |
| } |
| child.dumpInto(childPath, results) |
| } |
| } |
| |
| // a semaphore can be locked by up to <capacity> callers at once |
| type semaphore struct { |
| pool chan bool |
| } |
| |
| func newSemaphore(capacity int) *semaphore { |
| return &semaphore{pool: make(chan bool, capacity)} |
| } |
| |
| func (l *semaphore) Lock() { |
| l.pool <- true |
| } |
| |
| func (l *semaphore) Unlock() { |
| <-l.pool |
| } |
| |
| // A threadPool runs goroutines and supports throttling and waiting. |
| // Without throttling, Go may exhaust the maximum number of various resources, such as |
| // threads or file descriptors, and crash the program. |
| type threadPool struct { |
| receivedRequests sync.WaitGroup |
| activeRequests semaphore |
| } |
| |
| func newThreadPool(maxNumConcurrentThreads int) *threadPool { |
| return &threadPool{ |
| receivedRequests: sync.WaitGroup{}, |
| activeRequests: *newSemaphore(maxNumConcurrentThreads), |
| } |
| } |
| |
| // Run requests to run the given function in its own goroutine |
| func (p *threadPool) Run(function func()) { |
| p.receivedRequests.Add(1) |
| // If Run() was called from within a goroutine spawned by this threadPool, |
| // then we may need to return from Run() before having capacity to actually |
| // run <function>. |
| // |
| // It's possible that the body of <function> contains a statement (such as a syscall) |
| // that will cause Go to pin it to a thread, or will contain a statement that uses |
| // another resource that is in short supply (such as a file descriptor), so we can't |
| // actually run <function> until we have capacity. |
| // |
| // However, the semaphore used for synchronization is implemented via a channel and |
| // shouldn't require a new thread for each access. |
| go func() { |
| p.activeRequests.Lock() |
| function() |
| p.activeRequests.Unlock() |
| p.receivedRequests.Done() |
| }() |
| } |
| |
| // Wait waits until all goroutines are done, just like sync.WaitGroup's Wait |
| func (p *threadPool) Wait() { |
| p.receivedRequests.Wait() |
| } |
| |
| type fsErr struct { |
| path string |
| err error |
| } |
| |
| func (e fsErr) String() string { |
| return e.path + ": " + e.err.Error() |
| } |
| |
| func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) { |
| // group each dirFullInfo by its Device, to avoid having to repeat it in the output |
| dirsByDevice := map[uint64][]PersistedDirInfo{} |
| for _, entry := range dirInfos { |
| _, found := dirsByDevice[entry.Device] |
| if !found { |
| dirsByDevice[entry.Device] = []PersistedDirInfo{} |
| } |
| dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device], |
| PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames}) |
| } |
| |
| cacheEntry := CacheEntry{} |
| |
| for device, infos := range dirsByDevice { |
| // find common prefix |
| prefix := "" |
| if len(infos) > 0 { |
| prefix = infos[0].P |
| } |
| for _, info := range infos { |
| for !strings.HasPrefix(info.P+"/", prefix+"/") { |
| prefix = filepath.Dir(prefix) |
| if prefix == "/" { |
| break |
| } |
| } |
| } |
| // remove common prefix |
| for i := range infos { |
| suffix := strings.Replace(infos[i].P, prefix, "", 1) |
| if len(suffix) > 0 && suffix[0] == '/' { |
| suffix = suffix[1:] |
| } |
| infos[i].P = suffix |
| } |
| |
| // turn the map (keyed by device) into a list of structs with labeled fields |
| // this is to improve readability of the output |
| cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos}) |
| } |
| |
| // convert to json. |
| // it would save some space to use a different format than json for the db file, |
| // but the space and time savings are small, and json is easy for humans to read |
| bytes, err := json.Marshal(cacheEntry) |
| return bytes, err |
| } |
| |
| func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) { |
| var cacheEntry CacheEntry |
| err := json.Unmarshal(bytes, &cacheEntry) |
| if err != nil { |
| return nil, err |
| } |
| |
| // convert from a CacheEntry to a []dirFullInfo (by copying a few fields) |
| capacity := 0 |
| for _, element := range cacheEntry { |
| capacity += len(element.Dirs) |
| } |
| nodes := make([]dirFullInfo, capacity) |
| count := 0 |
| for _, element := range cacheEntry { |
| for _, dir := range element.Dirs { |
| path := joinCleanPaths(element.Root, dir.P) |
| |
| nodes[count] = dirFullInfo{ |
| pathAndStats: pathAndStats{ |
| statResponse: statResponse{ |
| ModTime: dir.T, Inode: dir.I, Device: element.Device, |
| }, |
| Path: path}, |
| FileNames: dir.F} |
| count++ |
| } |
| } |
| return nodes, nil |
| } |
| |
| // We use the following separator byte to distinguish individually parseable blocks of json |
| // because we know this separator won't appear in the json that we're parsing. |
| // |
| // The newline byte can only appear in a UTF-8 stream if the newline character appears, because: |
| // - The newline character is encoded as "0000 1010" in binary ("0a" in hex) |
| // - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte |
| // character. |
| // |
| // We know that the newline character will never appear in our json string, because: |
| // - If a newline character appears as part of a data string, then json encoding will |
| // emit two characters instead: '\' and 'n'. |
| // - The json encoder that we use doesn't emit the optional newlines between any of its |
| // other outputs. |
| const lineSeparator = byte('\n') |
| |
| func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) { |
| return reader.ReadBytes(lineSeparator) |
| } |
| |
| // validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder |
| func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool { |
| cacheVersionBytes, err := f.readLine(cacheReader) |
| if err != nil { |
| f.verbosef("Failed to read database header; database is invalid\n") |
| return false |
| } |
| if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator { |
| cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1] |
| } |
| cacheVersionString := string(cacheVersionBytes) |
| currentVersion := f.cacheMetadata.Version |
| if cacheVersionString != currentVersion { |
| f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion) |
| return false |
| } |
| |
| cacheParamBytes, err := f.readLine(cacheReader) |
| if err != nil { |
| f.verbosef("Failed to read database search params; database is invalid\n") |
| return false |
| } |
| |
| if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator { |
| cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1] |
| } |
| |
| currentParamBytes, err := f.cacheMetadata.Config.Dump() |
| if err != nil { |
| panic("Finder failed to serialize its parameters") |
| } |
| cacheParamString := string(cacheParamBytes) |
| currentParamString := string(currentParamBytes) |
| if cacheParamString != currentParamString { |
| f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString) |
| return false |
| } |
| return true |
| } |
| |
| // loadBytes compares the cache info in <data> to the state of the filesystem |
| // loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked |
| func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) { |
| |
| helperStartTime := time.Now() |
| |
| cachedNodes, err := f.parseCacheEntry(data) |
| if err != nil { |
| return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error()) |
| } |
| |
| unmarshalDate := time.Now() |
| f.verbosef("Unmarshaled %v objects for %v in %v\n", |
| len(cachedNodes), id, unmarshalDate.Sub(helperStartTime)) |
| |
| tempMap := newPathMap("/") |
| stats := make([]statResponse, len(cachedNodes)) |
| |
| for i, node := range cachedNodes { |
| // check the file system for an updated timestamp |
| stats[i] = f.statDirSync(node.Path) |
| } |
| |
| dirsToWalk = []string{} |
| for i, cachedNode := range cachedNodes { |
| updated := stats[i] |
| // save the cached value |
| container := tempMap.GetNode(cachedNode.Path, true) |
| container.mapNode = mapNode{statResponse: updated} |
| |
| // if the metadata changed and the directory still exists, then |
| // make a note to walk it later |
| if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 { |
| f.setModified() |
| // make a note that the directory needs to be walked |
| dirsToWalk = append(dirsToWalk, cachedNode.Path) |
| } else { |
| container.mapNode.FileNames = cachedNode.FileNames |
| } |
| } |
| // count the number of nodes to improve our understanding of the shape of the tree, |
| // thereby improving parallelism of subsequent searches |
| tempMap.UpdateNumDescendentsRecursive() |
| |
| f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate)) |
| return tempMap, dirsToWalk, nil |
| } |
| |
| // startFromExternalCache loads the cache database from disk |
| // startFromExternalCache waits to return until the load of the cache db is complete, but |
| // startFromExternalCache does not wait for all every listDir() or statDir() request to complete |
| func (f *Finder) startFromExternalCache() (err error) { |
| startTime := time.Now() |
| dbPath := f.DbPath |
| |
| // open cache file and validate its header |
| reader, err := f.filesystem.Open(dbPath) |
| if err != nil { |
| return errors.New("No data to load from database\n") |
| } |
| defer reader.Close() |
| bufferedReader := bufio.NewReader(reader) |
| if !f.validateCacheHeader(bufferedReader) { |
| return errors.New("Cache header does not match") |
| } |
| f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath) |
| |
| // read the file and spawn threads to process it |
| nodesToWalk := [][]*pathMap{} |
| mainTree := newPathMap("/") |
| |
| // read the blocks and stream them into <blockChannel> |
| type dataBlock struct { |
| id int |
| err error |
| data []byte |
| } |
| blockChannel := make(chan dataBlock, f.numDbLoadingThreads) |
| readBlocks := func() { |
| index := 0 |
| for { |
| // It takes some time to unmarshal the input from json, so we want |
| // to unmarshal it in parallel. In order to find valid places to |
| // break the input, we scan for the line separators that we inserted |
| // (for this purpose) when we dumped the database. |
| data, err := f.readLine(bufferedReader) |
| var response dataBlock |
| done := false |
| if err != nil && err != io.EOF { |
| response = dataBlock{id: index, err: err, data: nil} |
| done = true |
| } else { |
| done = (err == io.EOF) |
| response = dataBlock{id: index, err: nil, data: data} |
| } |
| blockChannel <- response |
| index++ |
| duration := time.Since(startTime) |
| f.verbosef("Read block %v after %v\n", index, duration) |
| if done { |
| f.verbosef("Read %v blocks in %v\n", index, duration) |
| close(blockChannel) |
| return |
| } |
| } |
| } |
| go readBlocks() |
| |
| // Read from <blockChannel> and stream the responses into <resultChannel>. |
| type workResponse struct { |
| id int |
| err error |
| tree *pathMap |
| updatedDirs []string |
| } |
| resultChannel := make(chan workResponse) |
| processBlocks := func() { |
| numProcessed := 0 |
| threadPool := newThreadPool(f.numDbLoadingThreads) |
| for { |
| // get a block to process |
| block, received := <-blockChannel |
| if !received { |
| break |
| } |
| |
| if block.err != nil { |
| resultChannel <- workResponse{err: block.err} |
| break |
| } |
| numProcessed++ |
| // wait until there is CPU available to process it |
| threadPool.Run( |
| func() { |
| processStartTime := time.Now() |
| f.verbosef("Starting to process block %v after %v\n", |
| block.id, processStartTime.Sub(startTime)) |
| tempMap, updatedDirs, err := f.loadBytes(block.id, block.data) |
| var response workResponse |
| if err != nil { |
| f.verbosef( |
| "Block %v failed to parse with error %v\n", |
| block.id, err) |
| response = workResponse{err: err} |
| } else { |
| response = workResponse{ |
| id: block.id, |
| err: nil, |
| tree: tempMap, |
| updatedDirs: updatedDirs, |
| } |
| } |
| f.verbosef("Processed block %v in %v\n", |
| block.id, time.Since(processStartTime), |
| ) |
| resultChannel <- response |
| }, |
| ) |
| } |
| threadPool.Wait() |
| f.verbosef("Finished processing %v blocks in %v\n", |
| numProcessed, time.Since(startTime)) |
| close(resultChannel) |
| } |
| go processBlocks() |
| |
| // Read from <resultChannel> and use the results |
| combineResults := func() (err error) { |
| for { |
| result, received := <-resultChannel |
| if !received { |
| break |
| } |
| if err != nil { |
| // In case of an error, wait for work to complete before |
| // returning the error. This ensures that any subsequent |
| // work doesn't need to compete for resources (and possibly |
| // fail due to, for example, a filesystem limit on the number of |
| // concurrently open files) with past work. |
| continue |
| } |
| if result.err != nil { |
| err = result.err |
| continue |
| } |
| // update main tree |
| mainTree.MergeIn(result.tree) |
| // record any new directories that we will need to Stat() |
| updatedNodes := make([]*pathMap, len(result.updatedDirs)) |
| for j, dir := range result.updatedDirs { |
| node := mainTree.GetNode(dir, false) |
| updatedNodes[j] = node |
| } |
| nodesToWalk = append(nodesToWalk, updatedNodes) |
| } |
| return err |
| } |
| err = combineResults() |
| if err != nil { |
| return err |
| } |
| |
| f.nodes = *mainTree |
| |
| // after having loaded the entire db and therefore created entries for |
| // the directories we know of, now it's safe to start calling ReadDir on |
| // any updated directories |
| for i := range nodesToWalk { |
| f.listDirsAsync(nodesToWalk[i]) |
| } |
| f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime)) |
| f.threadPool.Wait() |
| f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime)) |
| |
| return err |
| } |
| |
| // startWithoutExternalCache starts scanning the filesystem according to the cache config |
| // startWithoutExternalCache should be called if startFromExternalCache is not applicable |
| func (f *Finder) startWithoutExternalCache() { |
| startTime := time.Now() |
| configDirs := f.cacheMetadata.Config.RootDirs |
| |
| // clean paths |
| candidates := make([]string, len(configDirs)) |
| for i, dir := range configDirs { |
| candidates[i] = filepath.Clean(dir) |
| } |
| // remove duplicates |
| dirsToScan := make([]string, 0, len(configDirs)) |
| for _, candidate := range candidates { |
| include := true |
| for _, included := range dirsToScan { |
| if included == "/" || strings.HasPrefix(candidate+"/", included+"/") { |
| include = false |
| break |
| } |
| } |
| if include { |
| dirsToScan = append(dirsToScan, candidate) |
| } |
| } |
| |
| // start searching finally |
| for _, path := range dirsToScan { |
| f.verbosef("Starting find of %v\n", path) |
| f.startFind(path) |
| } |
| |
| f.threadPool.Wait() |
| |
| f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime)) |
| } |
| |
| // isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid |
| func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) { |
| if old.Inode != new.Inode { |
| return false |
| } |
| if old.ModTime != new.ModTime { |
| return false |
| } |
| if old.Device != new.Device { |
| return false |
| } |
| return true |
| } |
| |
| func (f *Finder) wasModified() bool { |
| return atomic.LoadInt32(&f.modifiedFlag) > 0 |
| } |
| |
| func (f *Finder) setModified() { |
| var newVal int32 |
| newVal = 1 |
| atomic.StoreInt32(&f.modifiedFlag, newVal) |
| } |
| |
| // sortedDirEntries exports directory entries to facilitate dumping them to the external cache |
| func (f *Finder) sortedDirEntries() []dirFullInfo { |
| startTime := time.Now() |
| nodes := make([]dirFullInfo, 0) |
| for _, node := range f.nodes.DumpAll() { |
| if node.ModTime != 0 { |
| nodes = append(nodes, node) |
| } |
| } |
| discoveryDate := time.Now() |
| f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime)) |
| less := func(i int, j int) bool { |
| return nodes[i].Path < nodes[j].Path |
| } |
| sort.Slice(nodes, less) |
| sortDate := time.Now() |
| f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate)) |
| |
| return nodes |
| } |
| |
| // serializeDb converts the cache database into a form to save to disk |
| func (f *Finder) serializeDb() ([]byte, error) { |
| // sort dir entries |
| var entryList = f.sortedDirEntries() |
| |
| // Generate an output file that can be conveniently loaded using the same number of threads |
| // as were used in this execution (because presumably that will be the number of threads |
| // used in the next execution too) |
| |
| // generate header |
| header := []byte{} |
| header = append(header, []byte(f.cacheMetadata.Version)...) |
| header = append(header, lineSeparator) |
| configDump, err := f.cacheMetadata.Config.Dump() |
| if err != nil { |
| return nil, err |
| } |
| header = append(header, configDump...) |
| |
| // serialize individual blocks in parallel |
| numBlocks := f.numDbLoadingThreads |
| if numBlocks > len(entryList) { |
| numBlocks = len(entryList) |
| } |
| blocks := make([][]byte, 1+numBlocks) |
| blocks[0] = header |
| blockMin := 0 |
| wg := sync.WaitGroup{} |
| var errLock sync.Mutex |
| |
| for i := 1; i <= numBlocks; i++ { |
| // identify next block |
| blockMax := len(entryList) * i / numBlocks |
| block := entryList[blockMin:blockMax] |
| |
| // process block |
| wg.Add(1) |
| go func(index int, block []dirFullInfo) { |
| byteBlock, subErr := f.serializeCacheEntry(block) |
| f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock)) |
| if subErr != nil { |
| f.verbosef("%v\n", subErr.Error()) |
| errLock.Lock() |
| err = subErr |
| errLock.Unlock() |
| } else { |
| blocks[index] = byteBlock |
| } |
| wg.Done() |
| }(i, block) |
| |
| blockMin = blockMax |
| } |
| |
| wg.Wait() |
| |
| if err != nil { |
| return nil, err |
| } |
| |
| content := bytes.Join(blocks, []byte{lineSeparator}) |
| |
| return content, nil |
| } |
| |
| // dumpDb saves the cache database to disk |
| func (f *Finder) dumpDb() error { |
| startTime := time.Now() |
| f.verbosef("Dumping db\n") |
| |
| tempPath := f.DbPath + ".tmp" |
| |
| bytes, err := f.serializeDb() |
| if err != nil { |
| return err |
| } |
| serializeDate := time.Now() |
| f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime)) |
| // dump file and atomically move |
| err = f.filesystem.WriteFile(tempPath, bytes, 0777) |
| if err != nil { |
| return err |
| } |
| err = f.filesystem.Rename(tempPath, f.DbPath) |
| if err != nil { |
| return err |
| } |
| |
| f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate)) |
| return nil |
| |
| } |
| |
| // canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore |
| func (f *Finder) canIgnoreFsErr(err error) bool { |
| pathErr, isPathErr := err.(*os.PathError) |
| if !isPathErr { |
| // Don't recognize this error |
| return false |
| } |
| if os.IsPermission(pathErr) { |
| // Permission errors are ignored: |
| // https://issuetracker.google.com/37553659 |
| // https://github.com/google/kati/pull/116 |
| return true |
| } |
| if pathErr.Err == os.ErrNotExist { |
| // If a directory doesn't exist, that generally means the cache is out-of-date |
| return true |
| } |
| // Don't recognize this error |
| return false |
| } |
| |
| // onFsError should be called whenever a potentially fatal error is returned from a filesystem call |
| func (f *Finder) onFsError(path string, err error) { |
| if !f.canIgnoreFsErr(err) { |
| // We could send the errors through a channel instead, although that would cause this call |
| // to block unless we preallocated a sufficient buffer or spawned a reader thread. |
| // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly |
| // more convenient to use a lock. Only in an unusual situation should this code be |
| // invoked anyway. |
| f.errlock.Lock() |
| f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err}) |
| f.errlock.Unlock() |
| } |
| } |
| |
| // discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache |
| func (f *Finder) discardErrsForPrunedPaths() { |
| // This function could be somewhat inefficient due to being single-threaded, |
| // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway. |
| relevantErrs := make([]fsErr, 0, len(f.fsErrs)) |
| for _, fsErr := range f.fsErrs { |
| path := fsErr.path |
| node := f.nodes.GetNode(path, false) |
| if node != nil { |
| // The path in question wasn't pruned due to a failure to process a parent directory. |
| // So, the failure to process this path is important |
| relevantErrs = append(relevantErrs, fsErr) |
| } |
| } |
| f.fsErrs = relevantErrs |
| } |
| |
| // getErr returns an error based on previous calls to onFsErr, if any |
| func (f *Finder) getErr() error { |
| f.discardErrsForPrunedPaths() |
| |
| numErrs := len(f.fsErrs) |
| if numErrs < 1 { |
| return nil |
| } |
| |
| maxNumErrsToInclude := 10 |
| message := "" |
| if numErrs > maxNumErrsToInclude { |
| message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude]) |
| } else { |
| message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs) |
| } |
| |
| return errors.New(message) |
| } |
| |
| func (f *Finder) statDirAsync(dir *pathMap) { |
| node := dir |
| path := dir.path |
| f.threadPool.Run( |
| func() { |
| updatedStats := f.statDirSync(path) |
| |
| if !f.isInfoUpToDate(node.statResponse, updatedStats) { |
| node.mapNode = mapNode{ |
| statResponse: updatedStats, |
| FileNames: []string{}, |
| } |
| f.setModified() |
| if node.statResponse.ModTime != 0 { |
| // modification time was updated, so re-scan for |
| // child directories |
| f.listDirAsync(dir) |
| } |
| } |
| }, |
| ) |
| } |
| |
| func (f *Finder) statDirSync(path string) statResponse { |
| |
| fileInfo, err := f.filesystem.Lstat(path) |
| |
| var stats statResponse |
| if err != nil { |
| // possibly record this error |
| f.onFsError(path, err) |
| // in case of a failure to stat the directory, treat the directory as missing (modTime = 0) |
| return stats |
| } |
| modTime := fileInfo.ModTime() |
| stats = statResponse{} |
| inode, err := f.filesystem.InodeNumber(fileInfo) |
| if err != nil { |
| panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error())) |
| } |
| stats.Inode = inode |
| device, err := f.filesystem.DeviceNumber(fileInfo) |
| if err != nil { |
| panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error())) |
| } |
| stats.Device = device |
| permissionsChangeTime, err := f.filesystem.PermTime(fileInfo) |
| |
| if err != nil { |
| panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error())) |
| } |
| // We're only interested in knowing whether anything about the directory |
| // has changed since last check, so we use the latest of the two |
| // modification times (content modification (mtime) and |
| // permission modification (ctime)) |
| if permissionsChangeTime.After(modTime) { |
| modTime = permissionsChangeTime |
| } |
| stats.ModTime = modTime.UnixNano() |
| |
| return stats |
| } |
| |
| func (f *Finder) shouldIncludeFile(fileName string) bool { |
| for _, includedName := range f.cacheMetadata.Config.IncludeFiles { |
| if fileName == includedName { |
| return true |
| } |
| } |
| for _, includeSuffix := range f.cacheMetadata.Config.IncludeSuffixes { |
| if strings.HasSuffix(fileName, includeSuffix) { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // pruneCacheCandidates removes the items that we don't want to include in our persistent cache |
| func (f *Finder) pruneCacheCandidates(items *DirEntries) { |
| |
| for _, fileName := range items.FileNames { |
| for _, abortedName := range f.cacheMetadata.Config.PruneFiles { |
| if fileName == abortedName { |
| items.FileNames = []string{} |
| items.DirNames = []string{} |
| return |
| } |
| } |
| } |
| |
| // remove any files that aren't the ones we want to include |
| writeIndex := 0 |
| for _, fileName := range items.FileNames { |
| if f.shouldIncludeFile(fileName) { |
| items.FileNames[writeIndex] = fileName |
| writeIndex++ |
| } |
| } |
| // resize |
| items.FileNames = items.FileNames[:writeIndex] |
| |
| writeIndex = 0 |
| for _, dirName := range items.DirNames { |
| items.DirNames[writeIndex] = dirName |
| // ignore other dirs that are known to not be inputs to the build process |
| include := true |
| for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs { |
| if dirName == excludedName { |
| // don't include |
| include = false |
| break |
| } |
| } |
| if include { |
| writeIndex++ |
| } |
| } |
| // resize |
| items.DirNames = items.DirNames[:writeIndex] |
| } |
| |
| func (f *Finder) listDirsAsync(nodes []*pathMap) { |
| f.threadPool.Run( |
| func() { |
| for i := range nodes { |
| f.listDirSync(nodes[i]) |
| } |
| }, |
| ) |
| } |
| |
| func (f *Finder) listDirAsync(node *pathMap) { |
| f.threadPool.Run( |
| func() { |
| f.listDirSync(node) |
| }, |
| ) |
| } |
| |
| func (f *Finder) listDirSync(dir *pathMap) { |
| path := dir.path |
| children, err := f.filesystem.ReadDir(path) |
| |
| if err != nil { |
| // possibly record this error |
| f.onFsError(path, err) |
| // if listing the contents of the directory fails (presumably due to |
| // permission denied), then treat the directory as empty |
| children = nil |
| } |
| |
| var subdirs []string |
| var subfiles []string |
| |
| for _, child := range children { |
| linkBits := child.Mode() & os.ModeSymlink |
| isLink := linkBits != 0 |
| if isLink { |
| childPath := filepath.Join(path, child.Name()) |
| childStat, err := f.filesystem.Stat(childPath) |
| if err != nil { |
| // If stat fails this is probably a broken or dangling symlink, treat it as a file. |
| subfiles = append(subfiles, child.Name()) |
| } else if childStat.IsDir() { |
| // Skip symlink dirs if not requested otherwise. Android has a number |
| // of symlinks creating infinite source trees which would otherwise get |
| // us in an infinite loop. |
| // TODO(b/197349722): Revisit this once symlink loops are banned in the |
| // source tree. |
| if f.cacheMetadata.Config.FollowSymlinks { |
| subdirs = append(subdirs, child.Name()) |
| } |
| } else { |
| // We do have to support symlink files because the link name might be |
| // different than the target name |
| // (for example, Android.bp -> build/soong/root.bp) |
| subfiles = append(subfiles, child.Name()) |
| } |
| } else if child.IsDir() { |
| subdirs = append(subdirs, child.Name()) |
| } else { |
| subfiles = append(subfiles, child.Name()) |
| } |
| |
| } |
| parentNode := dir |
| |
| entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles} |
| f.pruneCacheCandidates(entry) |
| |
| // create a pathMap node for each relevant subdirectory |
| relevantChildren := map[string]*pathMap{} |
| for _, subdirName := range entry.DirNames { |
| childNode, found := parentNode.children[subdirName] |
| // if we already knew of this directory, then we already have a request pending to Stat it |
| // if we didn't already know of this directory, then we must Stat it now |
| if !found { |
| childNode = parentNode.newChild(subdirName) |
| f.statDirAsync(childNode) |
| } |
| relevantChildren[subdirName] = childNode |
| } |
| // Note that in rare cases, it's possible that we're reducing the set of |
| // children via this statement, if these are all true: |
| // 1. we previously had a cache that knew about subdirectories of parentNode |
| // 2. the user created a prune-file (described in pruneCacheCandidates) |
| // inside <parentNode>, which specifies that the contents of parentNode |
| // are to be ignored. |
| // The fact that it's possible to remove children here means that *pathMap structs |
| // must not be looked up from f.nodes by filepath (and instead must be accessed by |
| // direct pointer) until after every listDirSync completes |
| parentNode.FileNames = entry.FileNames |
| parentNode.children = relevantChildren |
| |
| } |
| |
| // listMatches takes a node and a function that specifies which subdirectories and |
| // files to include, and listMatches returns the matches |
| func (f *Finder) listMatches(node *pathMap, |
| filter WalkFunc) (subDirs []*pathMap, filePaths []string) { |
| entries := DirEntries{ |
| FileNames: node.FileNames, |
| } |
| entries.DirNames = make([]string, 0, len(node.children)) |
| for childName := range node.children { |
| entries.DirNames = append(entries.DirNames, childName) |
| } |
| |
| dirNames, fileNames := filter(entries) |
| |
| subDirs = []*pathMap{} |
| filePaths = make([]string, 0, len(fileNames)) |
| for _, fileName := range fileNames { |
| filePaths = append(filePaths, joinCleanPaths(node.path, fileName)) |
| } |
| subDirs = make([]*pathMap, 0, len(dirNames)) |
| for _, childName := range dirNames { |
| child, ok := node.children[childName] |
| if ok { |
| subDirs = append(subDirs, child) |
| } |
| } |
| |
| return subDirs, filePaths |
| } |
| |
| // findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache. |
| func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc, |
| approxNumThreads int) []string { |
| |
| if approxNumThreads < 2 { |
| // Done spawning threads; process remaining directories |
| return f.findInCacheSinglethreaded(node, filter) |
| } |
| |
| totalWork := 0 |
| for _, child := range node.children { |
| totalWork += child.approximateNumDescendents |
| } |
| childrenResults := make(chan []string, len(node.children)) |
| |
| subDirs, filePaths := f.listMatches(node, filter) |
| |
| // process child directories |
| for _, child := range subDirs { |
| numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork |
| childProcessor := func(child *pathMap) { |
| childResults := f.findInCacheMultithreaded(child, filter, numChildThreads) |
| childrenResults <- childResults |
| } |
| // If we're allowed to use more than 1 thread to process this directory, |
| // then instead we use 1 thread for each subdirectory. |
| // It would be strange to spawn threads for only some subdirectories. |
| go childProcessor(child) |
| } |
| |
| // collect results |
| for i := 0; i < len(subDirs); i++ { |
| childResults := <-childrenResults |
| filePaths = append(filePaths, childResults...) |
| } |
| close(childrenResults) |
| |
| return filePaths |
| } |
| |
| // findInCacheSinglethreaded synchronously searches the cache for all matching file paths |
| // note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive |
| func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string { |
| if node == nil { |
| return []string{} |
| } |
| |
| nodes := []*pathMap{node} |
| matches := []string{} |
| |
| for len(nodes) > 0 { |
| currentNode := nodes[0] |
| nodes = nodes[1:] |
| |
| subDirs, filePaths := f.listMatches(currentNode, filter) |
| |
| nodes = append(nodes, subDirs...) |
| |
| matches = append(matches, filePaths...) |
| } |
| return matches |
| } |