Skip to content

Commit 3de6655

Browse files
committed
Add filtering capability
1 parent 6e65165 commit 3de6655

3 files changed

Lines changed: 156 additions & 21 deletions

File tree

create-url-list/README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,19 @@ go build
1111
## Usage
1212

1313
```bash
14-
./create-url-list [--quiet] <csv-file-path> [range] [output-path]
14+
./create-url-list [--quiet] [--contains <substring>] <csv-file-path> [range] [output-path]
1515
```
1616

1717
### Arguments
1818

1919
1. **--quiet** (optional): Suppress all informational output (warnings, info messages, and success messages). Only errors will be displayed. Useful when using this tool in pipelines.
20-
2. **csv-file-path** (required): Path to the input CSV file
21-
3. **range** (optional): Rank range in format `min-max` (e.g., `1-50`). Default: `1-250`
20+
2. **--contains** (optional): Only include URLs that contain the specified substring. The match is a case-sensitive substring test against the whole URL value (host and path), not just the path. For example, `--contains /manual/` keeps only URLs that contain `/manual/`.
21+
3. **csv-file-path** (required): Path to the input CSV file
22+
4. **range** (optional): Rank range in format `min-max` (e.g., `1-50`). Default: `1-250`
2223
- Specifies which ranked entries to include in the output
2324
- `1-50` means "get the top 50 pages by pageviews"
2425
- `51-100` means "get pages ranked 51-100 by pageviews"
25-
4. **output-path** (optional): Custom output file path. Default: `output/YYYY-MM-DD_HH-MM-SS_range.csv`
26+
5. **output-path** (optional): Custom output file path. Default: `output/YYYY-MM-DD_HH-MM-SS_range.csv`
2627

2728
### Examples
2829

@@ -39,8 +40,17 @@ go build
3940
# Specify custom output path
4041
./create-url-list data.csv 1-100 results/top-100.csv
4142

43+
# Filter for URLs containing "/manual/" (e.g., database manual documentation)
44+
./create-url-list --contains /manual/ data.csv
45+
46+
# Filter for URLs containing "/manual/" and get top 50
47+
./create-url-list --contains /manual/ data.csv 1-50
48+
4249
# Use in a pipeline with quiet mode (no informational output)
4350
./create-url-list --quiet data.csv 1-50 output.csv
51+
52+
# Combine multiple flags: quiet mode with URL filtering
53+
./create-url-list --quiet --contains /manual/ data.csv 1-50 output.csv
4454
```
4555

4656
## Input Requirements

create-url-list/main.go

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,32 @@ func main() {
3434
func run() error {
3535
// Parse command-line arguments
3636
if len(os.Args) < 2 {
37-
return fmt.Errorf("usage: %s [--quiet] <csv-file-path> [range] [output-path]", os.Args[0])
37+
return fmt.Errorf("usage: %s [--quiet] [--contains <substring>] <csv-file-path> [range] [output-path]", os.Args[0])
3838
}
3939

40-
// Check for --quiet flag
40+
// Check for --quiet and --contains flags
4141
quiet := false
42+
containsFilter := ""
4243
args := os.Args[1:]
43-
if len(args) > 0 && args[0] == "--quiet" {
44-
quiet = true
45-
args = args[1:] // Remove --quiet from args
44+
45+
// Process flags
46+
for len(args) > 0 && strings.HasPrefix(args[0], "--") {
47+
if args[0] == "--quiet" {
48+
quiet = true
49+
args = args[1:] // Remove --quiet from args
50+
} else if args[0] == "--contains" {
51+
if len(args) < 2 {
52+
return fmt.Errorf("--contains flag requires a substring argument")
53+
}
54+
containsFilter = args[1]
55+
args = args[2:] // Remove --contains and its argument from args
56+
} else {
57+
return fmt.Errorf("unknown flag: %s", args[0])
58+
}
4659
}
4760

4861
if len(args) < 1 {
49-
return fmt.Errorf("usage: %s [--quiet] <csv-file-path> [range] [output-path]", os.Args[0])
62+
return fmt.Errorf("usage: %s [--quiet] [--contains <substring>] <csv-file-path> [range] [output-path]", os.Args[0])
5063
}
5164

5265
inputPath := args[0]
@@ -82,7 +95,7 @@ func run() error {
8295
}
8396

8497
// Read and process CSV
85-
records, err := processCSV(inputPath, config.IgnoreURLs, quiet)
98+
records, err := processCSV(inputPath, config.IgnoreURLs, containsFilter, quiet)
8699
if err != nil {
87100
return err
88101
}
@@ -145,7 +158,7 @@ func loadConfig(configPath string) (*Config, error) {
145158
return &config, nil
146159
}
147160

148-
func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, error) {
161+
func processCSV(inputPath string, ignoreURLs []string, containsFilter string, quiet bool) ([]Record, error) {
149162
file, err := os.Open(inputPath)
150163
if err != nil {
151164
return nil, fmt.Errorf("failed to open file: %v", err)
@@ -188,6 +201,7 @@ func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, er
188201
var records []Record
189202
var skippedURLs []string
190203
var ignoredURLs []string
204+
var filteredURLs []string
191205
for {
192206
row, err := reader.Read()
193207
if err != nil {
@@ -217,6 +231,12 @@ func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, er
217231
continue
218232
}
219233

234+
// Filter by contains substring if specified
235+
if containsFilter != "" && !strings.Contains(page, containsFilter) {
236+
filteredURLs = append(filteredURLs, page)
237+
continue
238+
}
239+
220240
// Parse Measure Values
221241
measureValue, err := strconv.Atoi(row[measureValuesIdx])
222242
if err != nil {
@@ -245,6 +265,14 @@ func processCSV(inputPath string, ignoreURLs []string, quiet bool) ([]Record, er
245265
}
246266
}
247267

268+
// Report filtered URLs
269+
if !quiet && len(filteredURLs) > 0 {
270+
fmt.Fprintf(os.Stderr, "Info: Filtered out %d URL(s) not containing '%s':\n", len(filteredURLs), containsFilter)
271+
for _, url := range filteredURLs {
272+
fmt.Fprintf(os.Stderr, " - %s\n", url)
273+
}
274+
}
275+
248276
return records, nil
249277
}
250278

create-url-list/main_test.go

Lines changed: 106 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func TestParseRange(t *testing.T) {
5353

5454
// TestProcessCSV_MissingColumns tests that processCSV returns an error when required columns are missing
5555
func TestProcessCSV_MissingColumns(t *testing.T) {
56-
_, err := processCSV("testdata/missing-columns.csv", nil, false)
56+
_, err := processCSV("testdata/missing-columns.csv", nil, "", false)
5757
if err == nil {
5858
t.Error("processCSV() expected error for missing columns, got nil")
5959
}
@@ -65,7 +65,7 @@ func TestProcessCSV_MissingColumns(t *testing.T) {
6565

6666
// TestProcessCSV_InvalidURL tests that processCSV skips URLs that don't start with www.
6767
func TestProcessCSV_InvalidURL(t *testing.T) {
68-
records, err := processCSV("testdata/invalid-url.csv", nil, false)
68+
records, err := processCSV("testdata/invalid-url.csv", nil, "", false)
6969
if err != nil {
7070
t.Errorf("processCSV() unexpected error: %v", err)
7171
}
@@ -96,7 +96,7 @@ func TestProcessCSV_ValidFiltering(t *testing.T) {
9696

9797
for _, tt := range tests {
9898
t.Run(tt.name, func(t *testing.T) {
99-
records, err := processCSV(tt.file, nil, false)
99+
records, err := processCSV(tt.file, nil, "", false)
100100
if err != nil {
101101
t.Fatalf("processCSV() unexpected error: %v", err)
102102
}
@@ -115,7 +115,7 @@ func TestProcessCSV_ValidFiltering(t *testing.T) {
115115

116116
// TestProcessCSV_EmptyFile tests that processCSV handles empty CSV files
117117
func TestProcessCSV_EmptyFile(t *testing.T) {
118-
records, err := processCSV("testdata/empty.csv", nil, false)
118+
records, err := processCSV("testdata/empty.csv", nil, "", false)
119119
if err != nil {
120120
t.Fatalf("processCSV() unexpected error: %v", err)
121121
}
@@ -126,7 +126,7 @@ func TestProcessCSV_EmptyFile(t *testing.T) {
126126

127127
// TestProcessCSV_FileNotFound tests that processCSV returns an error for non-existent files
128128
func TestProcessCSV_FileNotFound(t *testing.T) {
129-
_, err := processCSV("testdata/nonexistent.csv", nil, false)
129+
_, err := processCSV("testdata/nonexistent.csv", nil, "", false)
130130
if err == nil {
131131
t.Error("processCSV() expected error for non-existent file, got nil")
132132
}
@@ -311,7 +311,7 @@ func splitLines(s string) []string {
311311

312312
// TestProcessCSV_OnlyPageviewsFiltered tests that only Pageviews rows are included
313313
func TestProcessCSV_OnlyPageviewsFiltered(t *testing.T) {
314-
records, err := processCSV("testdata/more-data.csv", nil, false)
314+
records, err := processCSV("testdata/more-data.csv", nil, "", false)
315315
if err != nil {
316316
t.Fatalf("processCSV() unexpected error: %v", err)
317317
}
@@ -352,7 +352,7 @@ func TestProcessCSV_URLValidation(t *testing.T) {
352352
t.Fatalf("Failed to create test file: %v", err)
353353
}
354354

355-
records, err := processCSV(tmpFile, nil, false)
355+
records, err := processCSV(tmpFile, nil, "", false)
356356
if err != nil {
357357
t.Errorf("processCSV() unexpected error for URL %q: %v", tt.url, err)
358358
}
@@ -403,7 +403,7 @@ func TestIntegration_EndToEnd(t *testing.T) {
403403
outputPath := filepath.Join(tmpDir, "result.csv")
404404

405405
// Process the valid-with-filtering.csv file
406-
records, err := processCSV("testdata/valid-with-filtering.csv", nil, false)
406+
records, err := processCSV("testdata/valid-with-filtering.csv", nil, "", false)
407407
if err != nil {
408408
t.Fatalf("processCSV() unexpected error: %v", err)
409409
}
@@ -458,7 +458,7 @@ www.example.com/page4,Pageviews,400
458458

459459
// Test with ignore list
460460
ignoreURLs := []string{"www.example.com/page2", "www.example.com/page4"}
461-
records, err := processCSV(tmpFile, ignoreURLs, false)
461+
records, err := processCSV(tmpFile, ignoreURLs, "", false)
462462
if err != nil {
463463
t.Fatalf("processCSV() unexpected error: %v", err)
464464
}
@@ -492,6 +492,103 @@ www.example.com/page4,Pageviews,400
492492
}
493493
}
494494

495+
// TestProcessCSV_ContainsFilter tests that URLs are filtered by substring
496+
func TestProcessCSV_ContainsFilter(t *testing.T) {
497+
tmpDir := t.TempDir()
498+
tmpFile := filepath.Join(tmpDir, "test.csv")
499+
500+
// Create test CSV with various URLs
501+
content := `Page,Measure Names,Measure Values
502+
www.example.com/manual/page1,Pageviews,100
503+
www.example.com/blog/post1,Pageviews,200
504+
www.example.com/manual/page2,Pageviews,150
505+
www.example.com/docs/guide,Pageviews,300
506+
www.example.com/manual/tutorial,Pageviews,250
507+
`
508+
if err := os.WriteFile(tmpFile, []byte(content), 0644); err != nil {
509+
t.Fatalf("Failed to create test file: %v", err)
510+
}
511+
512+
tests := []struct {
513+
name string
514+
containsFilter string
515+
expectedCount int
516+
expectedURLs []string
517+
}{
518+
{
519+
name: "no filter",
520+
containsFilter: "",
521+
expectedCount: 5,
522+
expectedURLs: []string{"www.example.com/manual/page1", "www.example.com/blog/post1", "www.example.com/manual/page2", "www.example.com/docs/guide", "www.example.com/manual/tutorial"},
523+
},
524+
{
525+
name: "filter for /manual/",
526+
containsFilter: "/manual/",
527+
expectedCount: 3,
528+
expectedURLs: []string{"www.example.com/manual/page1", "www.example.com/manual/page2", "www.example.com/manual/tutorial"},
529+
},
530+
{
531+
name: "filter for /blog/",
532+
containsFilter: "/blog/",
533+
expectedCount: 1,
534+
expectedURLs: []string{"www.example.com/blog/post1"},
535+
},
536+
{
537+
name: "filter for /docs/",
538+
containsFilter: "/docs/",
539+
expectedCount: 1,
540+
expectedURLs: []string{"www.example.com/docs/guide"},
541+
},
542+
{
543+
name: "filter with no matches",
544+
containsFilter: "/nonexistent/",
545+
expectedCount: 0,
546+
expectedURLs: []string{},
547+
},
548+
}
549+
550+
for _, tt := range tests {
551+
t.Run(tt.name, func(t *testing.T) {
552+
records, err := processCSV(tmpFile, nil, tt.containsFilter, false)
553+
if err != nil {
554+
t.Fatalf("processCSV() unexpected error: %v", err)
555+
}
556+
557+
if len(records) != tt.expectedCount {
558+
t.Errorf("processCSV() got %d records, want %d", len(records), tt.expectedCount)
559+
}
560+
561+
// Verify all expected URLs are present
562+
for _, expectedURL := range tt.expectedURLs {
563+
found := false
564+
for _, record := range records {
565+
if record.Page == expectedURL {
566+
found = true
567+
break
568+
}
569+
}
570+
if !found {
571+
t.Errorf("Expected URL %q not found in results", expectedURL)
572+
}
573+
}
574+
575+
// Verify no unexpected URLs are present
576+
for _, record := range records {
577+
found := false
578+
for _, expectedURL := range tt.expectedURLs {
579+
if record.Page == expectedURL {
580+
found = true
581+
break
582+
}
583+
}
584+
if !found {
585+
t.Errorf("Unexpected URL %q found in results", record.Page)
586+
}
587+
}
588+
})
589+
}
590+
}
591+
495592
// TestWriteOutput_ShowPageviews tests that pageviews column is added when enabled
496593
func TestWriteOutput_ShowPageviews(t *testing.T) {
497594
tmpDir := t.TempDir()

0 commit comments

Comments
 (0)