package filtering

import (
	"bufio"
	"fmt"
	"hash/crc32"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/AdguardTeam/golibs/errors"
	"github.com/AdguardTeam/golibs/log"
	"github.com/AdguardTeam/golibs/stringutil"
	"golang.org/x/exp/slices"
)

// filterDir is the subdirectory of a data directory to store downloaded
// filters.
const filterDir = "filters"

// nextFilterID is a way to seed a unique ID generation.  Seeded with the
// current Unix time; see updateUniqueFilterID and assignUniqueFilterID.
//
// TODO(e.burkov): Use more deterministic approach.
var nextFilterID = time.Now().Unix()

// FilterYAML represents a filter list in the configuration file.
//
// TODO(e.burkov): Investigate if the field ordering is important.
type FilterYAML struct {
	Enabled bool
	URL     string // URL or a file path
	Name    string `yaml:"name"`
	RulesCount  int       `yaml:"-"`
	LastUpdated time.Time `yaml:"-"`
	checksum uint32 // checksum of the file data
	white bool     // NOTE(review): presumably true for allowlist filters — confirm against filterAdd.
	Filter `yaml:",inline"`
}

// unload clears the in-memory filter statistics (rules count and checksum).
// It does not touch the on-disk filter contents.
func (filter *FilterYAML) unload() {
	filter.RulesCount = 0
	filter.checksum = 0
}

// Path returns the path to the filter contents: <dataDir>/filters/<ID>.txt.
func (filter *FilterYAML) Path(dataDir string) string {
	return filepath.Join(dataDir, filterDir, strconv.FormatInt(filter.ID, 10)+".txt")
}

// status* flags returned by filterSetProperties as a bitmask.
const (
	statusFound = 1 << iota
	statusEnabledChanged
	statusURLChanged
	statusURLExists
	statusUpdateRequired
)

// Update properties for a filter specified by its URL
// Return status* flags.
func (d *DNSFilter) filterSetProperties(url string, newf FilterYAML, whitelist bool) int { r := 0 d.filtersMu.Lock() defer d.filtersMu.Unlock() filters := d.Filters if whitelist { filters = d.WhitelistFilters } i := slices.IndexFunc(filters, func(filt FilterYAML) bool { return filt.URL == url }) if i == -1 { return 0 } filt := &filters[i] log.Debug("filter: set properties: %s: {%s %s %v}", filt.URL, newf.Name, newf.URL, newf.Enabled) filt.Name = newf.Name if filt.URL != newf.URL { r |= statusURLChanged | statusUpdateRequired if d.filterExistsNoLock(newf.URL) { return statusURLExists } filt.URL = newf.URL filt.unload() filt.LastUpdated = time.Time{} filt.checksum = 0 filt.RulesCount = 0 } if filt.Enabled != newf.Enabled { r |= statusEnabledChanged filt.Enabled = newf.Enabled if filt.Enabled { if (r & statusURLChanged) == 0 { err := d.load(filt) if err != nil { // TODO(e.burkov): It seems the error is only returned when // the file exists and couldn't be open. Investigate and // improve. log.Error("loading filter %d: %s", filt.ID, err) filt.LastUpdated = time.Time{} filt.checksum = 0 filt.RulesCount = 0 r |= statusUpdateRequired } } } else { filt.unload() } } return r | statusFound } // Return TRUE if a filter with this URL exists func (d *DNSFilter) filterExists(url string) bool { d.filtersMu.RLock() defer d.filtersMu.RUnlock() r := d.filterExistsNoLock(url) return r } func (d *DNSFilter) filterExistsNoLock(url string) bool { for _, f := range d.Filters { if f.URL == url { return true } } for _, f := range d.WhitelistFilters { if f.URL == url { return true } } return false } // Add a filter // Return FALSE if a filter with this URL exists func (d *DNSFilter) filterAdd(flt FilterYAML) bool { d.filtersMu.Lock() defer d.filtersMu.Unlock() // Check for duplicates if d.filterExistsNoLock(flt.URL) { return false } if flt.white { d.WhitelistFilters = append(d.WhitelistFilters, flt) } else { d.Filters = append(d.Filters, flt) } return true } // Load filters from the disk // 
And if any filter has zero ID, assign a new one func (d *DNSFilter) loadFilters(array []FilterYAML) { for i := range array { filter := &array[i] // otherwise we're operating on a copy if filter.ID == 0 { filter.ID = assignUniqueFilterID() } if !filter.Enabled { // No need to load a filter that is not enabled continue } err := d.load(filter) if err != nil { log.Error("Couldn't load filter %d contents due to %s", filter.ID, err) } } } func deduplicateFilters(filters []FilterYAML) (deduplicated []FilterYAML) { urls := stringutil.NewSet() lastIdx := 0 for _, filter := range filters { if !urls.Has(filter.URL) { urls.Add(filter.URL) filters[lastIdx] = filter lastIdx++ } } return filters[:lastIdx] } // Set the next filter ID to max(filter.ID) + 1 func updateUniqueFilterID(filters []FilterYAML) { for _, filter := range filters { if nextFilterID < filter.ID { nextFilterID = filter.ID + 1 } } } func assignUniqueFilterID() int64 { value := nextFilterID nextFilterID++ return value } // Sets up a timer that will be checking for filters updates periodically func (d *DNSFilter) periodicallyRefreshFilters() { const maxInterval = 1 * 60 * 60 intval := 5 // use a dynamically increasing time interval for { isNetErr, ok := false, false if d.FiltersUpdateIntervalHours != 0 { _, isNetErr, ok = d.tryRefreshFilters(true, true, false) if ok && !isNetErr { intval = maxInterval } } if isNetErr { intval *= 2 if intval > maxInterval { intval = maxInterval } } time.Sleep(time.Duration(intval) * time.Second) } } // tryRefreshFilters is like [refreshFilters], but backs down if the update is // already going on. // // TODO(e.burkov): Get rid of the concurrency pattern which requires the // sync.Mutex.TryLock. 
// tryRefreshFilters refreshes the lists unless a refresh is already running,
// in which case ok is false and nothing is done.
func (d *DNSFilter) tryRefreshFilters(block, allow, force bool) (updated int, isNetworkErr, ok bool) {
	if ok = d.refreshLock.TryLock(); !ok {
		return 0, false, ok
	}
	defer d.refreshLock.Unlock()

	updated, isNetworkErr = d.refreshFiltersIntl(block, allow, force)

	return updated, isNetworkErr, ok
}

// refreshFilters updates the lists and returns the number of updated ones.
// It's safe for concurrent use, but blocks at least until the previous
// refreshing is finished.
func (d *DNSFilter) refreshFilters(block, allow, force bool) (updated int) {
	d.refreshLock.Lock()
	defer d.refreshLock.Unlock()

	updated, _ = d.refreshFiltersIntl(block, allow, force)

	return updated
}

// listsToUpdate returns the slice of filter lists that could be updated.
// Unless force is set, a list is skipped while its last update is younger
// than FiltersUpdateIntervalHours.  Only the fields needed by the updater
// (ID, URL, Name, checksum) are copied out.
func (d *DNSFilter) listsToUpdate(filters *[]FilterYAML, force bool) (toUpd []FilterYAML) {
	now := time.Now()

	d.filtersMu.RLock()
	defer d.filtersMu.RUnlock()

	for i := range *filters {
		flt := &(*filters)[i] // otherwise we will be operating on a copy
		log.Debug("checking list at index %d: %v", i, flt)

		if !flt.Enabled {
			continue
		}

		if !force {
			exp := flt.LastUpdated.Add(time.Duration(d.FiltersUpdateIntervalHours) * time.Hour)
			if now.Before(exp) {
				continue
			}
		}

		toUpd = append(toUpd, FilterYAML{
			Filter: Filter{
				ID: flt.ID,
			},
			URL:      flt.URL,
			Name:     flt.Name,
			checksum: flt.checksum,
		})
	}

	return toUpd
}

// refreshFiltersArray updates the filters due for a refresh.  It returns the
// number of actually-updated lists, the attempted lists, the per-list changed
// flags, and whether every attempt failed (treated as a network error).
func (d *DNSFilter) refreshFiltersArray(filters *[]FilterYAML, force bool) (int, []FilterYAML, []bool, bool) {
	var updateFlags []bool // 'true' if filter data has changed

	updateFilters := d.listsToUpdate(filters, force)
	if len(updateFilters) == 0 {
		return 0, nil, nil, false
	}

	nfail := 0
	for i := range updateFilters {
		uf := &updateFilters[i]
		updated, err := d.update(uf)
		updateFlags = append(updateFlags, updated)
		if err != nil {
			nfail++
			log.Printf("Failed to update filter %s: %s\n", uf.URL, err)
			continue
		}
	}

	// Every single download failed — report it as a network error.
	if nfail == len(updateFilters) {
		return 0, nil, nil, true
	}

	updateCount := 0
	for i := range updateFilters {
		uf := &updateFilters[i]
		updated := updateFlags[i]

		// Copy the refreshed metadata back into the live list, matching by
		// both ID and URL.
		d.filtersMu.Lock()
		for k := range *filters {
			f := &(*filters)[k]
			if f.ID != uf.ID || f.URL != uf.URL {
				continue
			}

			f.LastUpdated = uf.LastUpdated
			if !updated {
				continue
			}

			log.Info("Updated filter #%d. Rules: %d -> %d", f.ID, f.RulesCount, uf.RulesCount)

			f.Name = uf.Name
			f.RulesCount = uf.RulesCount
			f.checksum = uf.checksum
			updateCount++
		}
		d.filtersMu.Unlock()
	}

	return updateCount, updateFilters, updateFlags, false
}

// refreshFiltersIntl checks filters and updates them if necessary.  If force
// is true, it ignores the filter.LastUpdated field value.
//
// Algorithm:
//
// 1. Get the list of filters to be updated.  For each filter, run the
// download and checksum check operation.  Store downloaded data in a
// temporary file inside data/filters directory
//
// 2. For each filter, if filter data hasn't changed, just set new update time
// on file.  Otherwise, rename the temporary file ( -> 1.txt).  Note
// that this method works only on Unix systems.  On Windows, don't pass
// files to filtering, pass the whole data.
//
// refreshFiltersIntl returns the number of updated filters.  It also returns
// true if there was a network error and nothing could be updated.
//
// TODO(a.garipov, e.burkov): What the hell?
func (d *DNSFilter) refreshFiltersIntl(block, allow, force bool) (int, bool) {
	log.Debug("filtering: updating...")

	updNum := 0
	var lists []FilterYAML
	var toUpd []bool
	isNetErr := false

	if block {
		updNum, lists, toUpd, isNetErr = d.refreshFiltersArray(&d.Filters, force)
	}
	if allow {
		updNumAl, listsAl, toUpdAl, isNetErrAl := d.refreshFiltersArray(&d.WhitelistFilters, force)

		updNum += updNumAl
		lists = append(lists, listsAl...)
		toUpd = append(toUpd, toUpdAl...)
		isNetErr = isNetErr || isNetErrAl
	}

	if isNetErr {
		return 0, true
	}

	if updNum != 0 {
		d.EnableFilters(false)

		// Clean up the stale ".old" copies of the lists that changed.
		for i := range lists {
			uf := &lists[i]
			updated := toUpd[i]
			if !updated {
				continue
			}

			_ = os.Remove(uf.Path(d.DataDir) + ".old")
		}
	}

	log.Debug("filtering: update finished")

	return updNum, false
}

// isPrintableText allows printable UTF-8 text with CR, LF, TAB characters.
// Only the first len bytes of data are checked.
//
// NOTE(review): the parameter named len shadows the builtin within this
// function body.
func isPrintableText(data []byte, len int) bool {
	for i := 0; i < len; i++ {
		c := data[i]
		if (c >= ' ' && c != 0x7f) || c == '\n' || c == '\r' || c == '\t' {
			continue
		}
		return false
	}
	return true
}

// parseFilterContents is a helper function that parses filter contents and
// returns a number of rules, a CRC32 checksum of the raw data, and a filter
// name (if there's any).  The name is taken from the first line matching
// d.filterTitleRegexp among the '!'-comment lines.
func (d *DNSFilter) parseFilterContents(file io.Reader) (int, uint32, string) {
	rulesCount := 0
	name := ""
	seenTitle := false
	r := bufio.NewReader(file)
	checksum := uint32(0)

	for {
		line, err := r.ReadString('\n')
		checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line))

		line = strings.TrimSpace(line)
		if len(line) == 0 {
			// Skip empty lines.
		} else if line[0] == '!' {
			m := d.filterTitleRegexp.FindAllStringSubmatch(line, -1)
			if len(m) > 0 && len(m[0]) >= 2 && !seenTitle {
				name = m[0][1]
				seenTitle = true
			}
		} else if line[0] == '#' {
			// Skip '#'-style comment lines.
		} else {
			rulesCount++
		}

		if err != nil {
			break
		}
	}

	return rulesCount, checksum, name
}

// update performs an upgrade on a filter and updates its LastUpdated value.
// When the data didn't change, only the file's timestamps are touched.
func (d *DNSFilter) update(filter *FilterYAML) (bool, error) {
	b, err := d.updateIntl(filter)
	filter.LastUpdated = time.Now()
	if !b {
		e := os.Chtimes(filter.Path(d.DataDir), filter.LastUpdated, filter.LastUpdated)
		if e != nil {
			log.Error("os.Chtimes(): %v", e)
		}
	}

	return b, err
}

// read streams the filter contents from reader, buffering the first 4 KiB to
// reject non-printable (likely binary or HTML) payloads.
//
// NOTE(review): this definition runs past the visible chunk and is shown here
// only up to the cut.
func (d *DNSFilter) read(reader io.Reader, tmpFile *os.File, filter *FilterYAML) (int, error) {
	htmlTest := true
	firstChunk := make([]byte, 4*1024)
	firstChunkLen := 0
	buf := make([]byte, 64*1024)
	total := 0
	for {
		n, err := reader.Read(buf)
		total += n

		if htmlTest {
			num := len(firstChunk) - firstChunkLen
			if n < num {
				num = n
			}
			copied := copy(firstChunk[firstChunkLen:], buf[:num])
			firstChunkLen += copied

			if firstChunkLen == len(firstChunk) || err == io.EOF {
				if !isPrintableText(firstChunk, firstChunkLen) {
					return total, fmt.Errorf("data contains non-printable characters")
				}

				s := strings.ToLower(string(firstChunk))
				if strings.Contains(s, "