Skip to content

Commit aa052c7

Browse files
committed
Feat: add advanced deduplication for subdomains
Applies only to domain/full entries without attributes
1 parent 68d291d commit aa052c7

1 file changed

Lines changed: 74 additions & 16 deletions

File tree

main.go

Lines changed: 74 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,76 @@ func ParseList(refList *List) error {
246246
return nil
247247
}
248248

249+
func polishList(rl *[]Entry) []Entry {
250+
// Remove basic duplicates
251+
pendingList := make([]Entry, 0, len(*rl)) // Exactly same entries removed
252+
entry2String := func(e Entry) string { // Attributes already sorted
253+
return e.Type + ":" + e.Value + "@" + strings.Join(e.Attrs, "@")
254+
}
255+
bscDupMap := make(map[string]bool)
256+
for _, entry := range *rl {
257+
if estring := entry2String(entry); !bscDupMap[estring] {
258+
bscDupMap[estring] = true
259+
pendingList = append(pendingList, entry)
260+
}
261+
}
262+
263+
finalList := make([]Entry, 0, len(pendingList))
264+
queuingList := make([]Entry, 0, len(pendingList)) // Domain/full entries without attr
265+
domainsMap := make(map[string]bool)
266+
for _, entry := range pendingList {
267+
switch entry.Type { // Bypass regexp, keyword and "full/domain with attr"
268+
case RuleTypeRegexp:
269+
finalList = append(finalList, entry)
270+
case RuleTypeKeyword:
271+
finalList = append(finalList, entry)
272+
case RuleTypeDomain:
273+
domainsMap[entry.Value] = true
274+
if len(entry.Attrs) != 0 {
275+
finalList = append(finalList, entry)
276+
} else {
277+
queuingList = append(queuingList, entry)
278+
}
279+
case RuleTypeFullDomain:
280+
if len(entry.Attrs) != 0 {
281+
finalList = append(finalList, entry)
282+
} else {
283+
queuingList = append(queuingList, entry)
284+
}
285+
}
286+
}
287+
288+
// Remove redundant subdomains for full/domain without attr
289+
for _, qentry := range queuingList {
290+
parts := strings.Split(qentry.Value, ".")
291+
isRedundant := false
292+
for i := 1; i < len(parts) - 1 ; i++ {
293+
// Not check parent for level2 "name.tld" domain / tld will not become a parent
294+
parentdomain := strings.Join(parts[i:], ".")
295+
if domainsMap[parentdomain] {
296+
isRedundant = true
297+
break
298+
}
299+
}
300+
if !isRedundant {
301+
finalList = append(finalList, qentry)
302+
}
303+
}
304+
305+
// Sort final entries
306+
sort.Slice(finalList, func(i, j int) bool {
307+
if finalList[i].Type != finalList[j].Type {
308+
return finalList[i].Type < finalList[j].Type
309+
}
310+
if finalList[i].Value != finalList[j].Value {
311+
return finalList[i].Value < finalList[j].Value
312+
}
313+
// Ideally, the comparison here will not be triggered by source data
314+
return strings.Join(finalList[i].Attrs, ",") < strings.Join(finalList[j].Attrs, ",")
315+
})
316+
return finalList
317+
}
318+
249319
func ResolveList(pl *ParsedList) error {
250320
if _, pldone := finalMap[pl.Name]; pldone { return nil }
251321

@@ -255,9 +325,6 @@ func ResolveList(pl *ParsedList) error {
255325
cirIncMap[pl.Name] = true
256326
defer delete(cirIncMap, pl.Name)
257327

258-
entry2String := func(e Entry) string { // Attributes already sorted
259-
return e.Type + ":" + e.Value + "@" + strings.Join(e.Attrs, "@")
260-
}
261328
isMatchAttrFilters := func(entry Entry, incFilter Inclusion) bool {
262329
if len(incFilter.MustAttrs) == 0 && len(incFilter.BanAttrs) == 0 { return true }
263330
if len(entry.Attrs) == 0 { return len(incFilter.MustAttrs) == 0 }
@@ -275,14 +342,8 @@ func ResolveList(pl *ParsedList) error {
275342
return true
276343
}
277344

278-
bscDupMap := make(map[string]bool) // Used for basic duplicates detection
279-
var finalList []Entry
280-
for _, dentry := range pl.Entry {
281-
if dstring := entry2String(dentry); !bscDupMap[dstring] {
282-
bscDupMap[dstring] = true
283-
finalList = append(finalList, dentry)
284-
}
285-
}
345+
var roughList []Entry
346+
roughList = append(roughList, pl.Entry...)
286347

287348
for _, inc := range pl.Inclusions {
288349
incPl, exist := plMap[inc.Source]
@@ -294,14 +355,11 @@ func ResolveList(pl *ParsedList) error {
294355
}
295356
for _, ientry := range finalMap[inc.Source] {
296357
if isMatchAttrFilters(ientry, inc) {
297-
if istring := entry2String(ientry); !bscDupMap[istring] {
298-
bscDupMap[istring] = true
299-
finalList = append(finalList, ientry)
300-
}
358+
roughList = append(roughList, ientry)
301359
}
302360
}
303361
}
304-
finalMap[pl.Name] = finalList
362+
finalMap[pl.Name] = polishList(&roughList)
305363
return nil
306364
}
307365

0 commit comments

Comments
 (0)