tokendiff/tokendiff.go at main · dacharyc/tokendiff · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
// Package tokendiff provides token-level diffing with delimiter support.
//
// Unlike traditional line-based diff tools, tokendiff operates at the token level
// and treats configurable delimiter characters as separate tokens. This allows
// for more precise diffs when comparing code or structured text.
//
// For example, when comparing:
//
//	someFunction(SomeType var)
//	someFunction(SomeOtherType var)
//
// A line-based diff would show the entire line changed. A word-based diff
// without delimiter awareness might show "someFunction(SomeType" changed to
// "someFunction(SomeOtherType". But tokendiff correctly identifies that only
// "SomeType" changed to "SomeOtherType" because it treats "(" as a delimiter.
//
// This package uses github.com/dacharyc/diffx to provide this functionality.
package tokendiff

import (
	"strings"

	"github.com/dacharyc/diffx"
)

// DefaultDelimiters is the delimiter set used when none is configured.
// Delimiters are characters that are treated as separate tokens even when
// not surrounded by whitespace.
//
// The default is deliberately the empty string: the original dwdiff has NO
// default delimiters, so words are split only on whitespace unless -d or -P
// is specified.
const DefaultDelimiters = ""

// DefaultWhitespace contains the default set of whitespace characters:
// space, tab, newline, and carriage return.
const DefaultWhitespace = " \t\n\r"

// Operation identifies what happened to a token in a diff.
type Operation int

// The possible Operation values. Equal is the zero value.
const (
	// Equal marks a token that appears unchanged.
	Equal Operation = iota
	// Insert marks a token that was added.
	Insert
	// Delete marks a token that was removed.
	Delete
)

// String returns the operation's name ("Equal", "Insert" or "Delete"), or
// "Unknown" for any value outside the declared constants.
func (o Operation) String() string {
	names := [...]string{
		Equal:  "Equal",
		Insert: "Insert",
		Delete: "Delete",
	}
	// Anything that does not index into the table is not a declared operation.
	if o < 0 || int(o) >= len(names) {
		return "Unknown"
	}
	return names[o]
}

// Diff represents a single diff operation on a token.
//
// Type says whether the token is unchanged (Equal), added (Insert), or
// removed (Delete); Token holds the text of the token itself.
type Diff struct {
	Type  Operation
	Token string
}

// Options configures the diff behavior.
//
// The options are passed to the tokenizer (Tokenize / TokenizeWithPositions),
// and IgnoreCase additionally selects the case-insensitive comparison path in
// the Diff* entry points.
type Options struct {
	// Delimiters is the set of characters to treat as separate tokens.
	// If empty, DefaultDelimiters is used (which is itself empty, so no
	// delimiter splitting takes place by default).
	// This is ignored if UsePunctuation is true.
	Delimiters string

	// Whitespace is the set of characters to treat as whitespace (word separators).
	// If empty, DefaultWhitespace is used.
	Whitespace string

	// UsePunctuation, when true, uses Unicode punctuation characters as
	// delimiters instead of the Delimiters string. This matches dwdiff's
	// -P/--punctuation flag behavior.
	UsePunctuation bool

	// PreserveWhitespace, when true, includes whitespace as separate tokens.
	// When false (default), whitespace is used only to separate words and
	// is not included in the diff output.
	PreserveWhitespace bool

	// IgnoreCase, when true, performs case-insensitive comparison.
	// Tokens are compared after lowercasing; the original case is preserved
	// in the output.
	IgnoreCase bool
}

// DefaultOptions returns an Options value with the package defaults:
// Delimiters is DefaultDelimiters and every other field is left at its zero
// value (so PreserveWhitespace is false).
func DefaultOptions() Options {
	var opts Options
	opts.Delimiters = DefaultDelimiters
	return opts
}

// DiffResult contains diff output along with position information
// needed to reconstruct original spacing for Equal content.
//
// Positions1 and Positions2 are the position slices returned by
// TokenizeWithPositions for Text1 and Text2 respectively.
type DiffResult struct {
	Diffs      []Diff
	Text1      string     // original old text
	Text2      string     // original new text
	Positions1 []TokenPos // token positions in text1
	Positions2 []TokenPos // token positions in text2
}

// DiffTokens computes the diff between two token slices.
//
// tokens1 is the old sequence and tokens2 the new one. The work is delegated
// to diffTokensWithDiffx, which runs diffx's histogram diff
// (diffx.DiffHistogram) and converts the result into one Diff per token.
func DiffTokens(tokens1, tokens2 []string) []Diff {
	return diffTokensWithDiffx(tokens1, tokens2)
}

// diffTokensWithDiffx runs diffx's histogram diff over the two token slices
// and converts the resulting ops into tokendiff Diffs.
// The histogram-style diff is chosen because it produces cleaner output by
// avoiding spurious matches on common words like "the", "for", "in".
func diffTokensWithDiffx(tokens1, tokens2 []string) []Diff {
	return diffxOpsToDiffs(diffx.DiffHistogram(tokens1, tokens2), tokens1, tokens2)
}

// diffxOpsToDiffs flattens diffx range operations into per-token Diffs.
//
// Equal and Delete ops cover the half-open range [AStart, AEnd) of tokens1;
// Insert ops cover [BStart, BEnd) of tokens2. Ops of any other type are
// skipped. The result is nil when no tokens are produced.
func diffxOpsToDiffs(ops []diffx.DiffOp, tokens1, tokens2 []string) []Diff {
	var out []Diff

	for _, op := range ops {
		switch op.Type {
		case diffx.Equal, diffx.Delete:
			// Both kinds read from the old side; only the tag differs.
			kind := Delete
			if op.Type == diffx.Equal {
				kind = Equal
			}
			for idx := op.AStart; idx < op.AEnd; idx++ {
				out = append(out, Diff{Type: kind, Token: tokens1[idx]})
			}
		case diffx.Insert:
			for idx := op.BStart; idx < op.BEnd; idx++ {
				out = append(out, Diff{Type: Insert, Token: tokens2[idx]})
			}
		}
	}

	return out
}

// DiffTokensRaw computes the diff without semantic cleanup.
//
// NOTE(review): as written this is identical to DiffTokens. Both delegate to
// diffTokensWithDiffx, which calls diffx.DiffHistogram; no separate cleanup
// pass or plain Myers variant is visible in this file. Confirm whether
// callers rely on a distinction.
func DiffTokensRaw(tokens1, tokens2 []string) []Diff {
	return diffTokensWithDiffx(tokens1, tokens2)
}

// DiffStrings tokenizes text1 (old) and text2 (new) with opts and returns the
// token-level diff. When opts.IgnoreCase is set, tokens are compared
// case-insensitively via diffTokensIgnoreCase; otherwise DiffTokens is used.
func DiffStrings(text1, text2 string, opts Options) []Diff {
	oldTokens, newTokens := Tokenize(text1, opts), Tokenize(text2, opts)

	if !opts.IgnoreCase {
		return DiffTokens(oldTokens, newTokens)
	}
	return diffTokensIgnoreCase(oldTokens, newTokens)
}

// DiffStringsWithPositions tokenizes and diffs text1 (old) against text2
// (new) and returns the diff bundled with both original texts and their
// token positions, so formatters can preserve original spacing for Equal
// content. opts.IgnoreCase selects the case-insensitive comparison.
func DiffStringsWithPositions(text1, text2 string, opts Options) DiffResult {
	oldTokens, oldPos := TokenizeWithPositions(text1, opts)
	newTokens, newPos := TokenizeWithPositions(text2, opts)

	res := DiffResult{
		Text1:      text1,
		Text2:      text2,
		Positions1: oldPos,
		Positions2: newPos,
	}
	switch {
	case opts.IgnoreCase:
		res.Diffs = diffTokensIgnoreCase(oldTokens, newTokens)
	default:
		res.Diffs = DiffTokens(oldTokens, newTokens)
	}
	return res
}

// diffTokensIgnoreCase diffs the two token slices case-insensitively while
// keeping the original spelling in the output.
//
// The comparison runs on strings.ToLower copies of the tokens. Equal and
// Insert entries carry the token from tokens2 (the new side); Delete entries
// carry the token from tokens1. The result is nil when no tokens are produced.
func diffTokensIgnoreCase(tokens1, tokens2 []string) []Diff {
	// lowered returns a lowercased copy of in, leaving in untouched.
	lowered := func(in []string) []string {
		res := make([]string, len(in))
		for idx := range in {
			res[idx] = strings.ToLower(in[idx])
		}
		return res
	}

	ops := diffx.DiffHistogram(lowered(tokens1), lowered(tokens2))

	var out []Diff
	for _, op := range ops {
		switch op.Type {
		case diffx.Delete:
			for idx := op.AStart; idx < op.AEnd; idx++ {
				out = append(out, Diff{Type: Delete, Token: tokens1[idx]})
			}
		case diffx.Equal, diffx.Insert:
			// Both kinds take their text from the new side, so Equal tokens
			// show the new file's casing.
			kind := Insert
			if op.Type == diffx.Equal {
				kind = Equal
			}
			for idx := op.BStart; idx < op.BEnd; idx++ {
				out = append(out, Diff{Type: kind, Token: tokens2[idx]})
			}
		}
	}
	return out
}

// HasChanges reports whether diffs contains at least one entry whose Type is
// not Equal. An empty or nil slice has no changes.
func HasChanges(diffs []Diff) bool {
	for idx := range diffs {
		if diffs[idx].Type == Equal {
			continue
		}
		return true
	}
	return false
}

// DiffStatistics holds statistics about a diff operation.
//
// As filled in by ComputeStatistics, a "word" is one token: OldWords and
// NewWords are token counts of the two texts, and the remaining fields count
// diff entries by type.
type DiffStatistics struct {
	OldWords      int // total words in old text
	NewWords      int // total words in new text
	DeletedWords  int // words deleted (present in old but not new)
	InsertedWords int // words inserted (present in new but not old)
	CommonWords   int // words common to both texts
}

// ComputeStatistics summarizes a diff.
//
// OldWords and NewWords are the token counts of text1 and text2 when
// tokenized with opts. CommonWords, DeletedWords and InsertedWords are the
// numbers of Equal, Delete and Insert entries in diffs; entries of any other
// type are not counted.
func ComputeStatistics(text1, text2 string, diffs []Diff, opts Options) DiffStatistics {
	stats := DiffStatistics{
		OldWords: len(Tokenize(text1, opts)),
		NewWords: len(Tokenize(text2, opts)),
	}

	for idx := range diffs {
		switch diffs[idx].Type {
		case Insert:
			stats.InsertedWords++
		case Delete:
			stats.DeletedWords++
		case Equal:
			stats.CommonWords++
		}
	}

	return stats
}

// DiffTokensWithPreprocessing computes the diff using histogram-style preprocessing.
//
// It is currently a direct call to DiffTokens, which already uses diffx's
// histogram diff. As described by the original author, that algorithm:
// 1. Filters stopwords (common words like "the", "for", "in") from anchor selection
// 2. Uses low-frequency tokens as anchors for divide-and-conquer
// 3. Produces cleaner output without spurious matches on common words
//
// This produces readable output that groups semantically related changes together.
// NOTE(review): the three points above describe diffx internals that are not
// visible in this file; verify against the diffx documentation.
func DiffTokensWithPreprocessing(tokens1, tokens2 []string) []Diff {
	// Use diffx histogram diff which handles stopword filtering internally
	return DiffTokens(tokens1, tokens2)
}

// DiffStringsWithPreprocessing tokenizes text1 (old) and text2 (new) with
// opts and diffs them through the preprocessing path.
//
// Without opts.IgnoreCase this is DiffTokensWithPreprocessing. With it, the
// tokens are lowercased for comparison and handed, together with the
// originals, to diffTokensIgnoreCaseWithPreprocessing so the output keeps the
// original case.
func DiffStringsWithPreprocessing(text1, text2 string, opts Options) []Diff {
	oldTokens := Tokenize(text1, opts)
	newTokens := Tokenize(text2, opts)

	if !opts.IgnoreCase {
		return DiffTokensWithPreprocessing(oldTokens, newTokens)
	}

	// lowered returns a lowercased copy of in, leaving in untouched.
	lowered := func(in []string) []string {
		res := make([]string, len(in))
		for idx := range in {
			res[idx] = strings.ToLower(in[idx])
		}
		return res
	}
	return diffTokensIgnoreCaseWithPreprocessing(oldTokens, newTokens, lowered(oldTokens), lowered(newTokens))
}

// DiffStringsWithPositionsAndPreprocessing is the position-aware counterpart
// of DiffStringsWithPreprocessing: it tokenizes text1 (old) and text2 (new)
// with TokenizeWithPositions, diffs them through the preprocessing path, and
// returns the diff together with both texts and their token positions so
// formatters can preserve original spacing for Equal content.
func DiffStringsWithPositionsAndPreprocessing(text1, text2 string, opts Options) DiffResult {
	oldTokens, oldPos := TokenizeWithPositions(text1, opts)
	newTokens, newPos := TokenizeWithPositions(text2, opts)

	res := DiffResult{
		Text1:      text1,
		Text2:      text2,
		Positions1: oldPos,
		Positions2: newPos,
	}

	if !opts.IgnoreCase {
		res.Diffs = DiffTokensWithPreprocessing(oldTokens, newTokens)
		return res
	}

	// Case-insensitive: compare lowercased copies, report original spelling.
	lowered := func(in []string) []string {
		cp := make([]string, len(in))
		for idx := range in {
			cp[idx] = strings.ToLower(in[idx])
		}
		return cp
	}
	res.Diffs = diffTokensIgnoreCaseWithPreprocessing(oldTokens, newTokens, lowered(oldTokens), lowered(newTokens))
	return res
}

// diffTokensIgnoreCaseWithPreprocessing performs the case-insensitive diff
// for the preprocessing path.
//
// tokens1/tokens2 are the original tokens; lower1/lower2 are their lowercased
// counterparts. The lowercased tokens are filtered with
// DiscardConfusingTokens, the survivors are diffed, the result is expanded
// back over the original-case tokens, and finally passed through
// ShiftBoundaries. If filtering leaves nothing on either side, the function
// falls back to diffTokensIgnoreCase on the unfiltered tokens.
func diffTokensIgnoreCaseWithPreprocessing(tokens1, tokens2, lower1, lower2 []string) []Diff {
	// keptOld/keptNew are the tokens that survived filtering; oldIndex/newIndex
	// give, for each survivor, its index in the unfiltered slice.
	keptOld, keptNew, oldIndex, newIndex := DiscardConfusingTokens(lower1, lower2)

	if len(keptOld)+len(keptNew) == 0 {
		return diffTokensIgnoreCase(tokens1, tokens2)
	}

	return ShiftBoundaries(
		expandFilteredDiffsWithCase(DiffTokens(keptOld, keptNew), tokens1, tokens2, lower1, lower2, oldIndex, newIndex),
	)
}

// expandFilteredDiffsWithCase turns a diff computed over filtered tokens back
// into a diff over the full, original-case token sequences.
//
// filteredDiffs is the diff of the filtered tokens. map1[i] / map2[i] give
// the index in tokens1 / tokens2 of the i-th filtered token on that side.
// Tokens that were filtered out are re-emitted as Delete (old side) or Insert
// (new side) just before the next surviving token on their side, and any
// remaining ones are emitted at the end. Equal entries carry the token from
// tokens2. lower1 and lower2 are accepted but not used.
func expandFilteredDiffsWithCase(filteredDiffs []Diff, tokens1, tokens2, lower1, lower2 []string, map1, map2 []int) []Diff {
	out := make([]Diff, 0, len(tokens1)+len(tokens2))

	// oldPos/newPos walk the unfiltered token slices; oldKept/newKept walk
	// the filtered sequences (and therefore index map1/map2).
	var oldPos, newPos, oldKept, newKept int

	// drainOld emits tokens1[oldPos:limit) as deletions.
	drainOld := func(limit int) {
		for ; oldPos < limit; oldPos++ {
			out = append(out, Diff{Type: Delete, Token: tokens1[oldPos]})
		}
	}
	// drainNew emits tokens2[newPos:limit) as insertions.
	drainNew := func(limit int) {
		for ; newPos < limit; newPos++ {
			out = append(out, Diff{Type: Insert, Token: tokens2[newPos]})
		}
	}

	for _, d := range filteredDiffs {
		switch d.Type {
		case Equal:
			oldTarget, newTarget := map1[oldKept], map2[newKept]

			// Flush filtered-out tokens on both sides before the match.
			drainOld(oldTarget)
			drainNew(newTarget)

			// The matched token is reported with the new file's spelling.
			out = append(out, Diff{Type: Equal, Token: tokens2[newPos]})
			oldPos++
			newPos++
			oldKept++
			newKept++

		case Delete:
			drainOld(map1[oldKept])
			out = append(out, Diff{Type: Delete, Token: tokens1[oldPos]})
			oldPos++
			oldKept++

		case Insert:
			drainNew(map2[newKept])
			out = append(out, Diff{Type: Insert, Token: tokens2[newPos]})
			newPos++
			newKept++
		}
	}

	// Whatever is left after the last filtered diff entry was filtered out.
	drainOld(len(tokens1))
	drainNew(len(tokens2))

	return out
}