$argv
$option[0]
$option[1]
$option[1]
$option[1]
$option[1]
$option[1]
+ $option[1]
+ $option[1]
$directories
diff --git a/src/CLI/Application.php b/src/CLI/Application.php
index 15b977cc..5a4a6ebd 100644
--- a/src/CLI/Application.php
+++ b/src/CLI/Application.php
@@ -12,9 +12,13 @@
use const PHP_EOL;
use function count;
use function printf;
+use Exception;
use SebastianBergmann\FileIterator\Facade;
use SebastianBergmann\PHPCPD\Detector\Detector;
+use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy;
use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy;
+use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy;
use SebastianBergmann\PHPCPD\Log\PMD;
use SebastianBergmann\PHPCPD\Log\Text;
use SebastianBergmann\Timer\ResourceUsageFormatter;
@@ -62,17 +66,14 @@ public function run(array $argv): int
return 1;
}
- $strategy = new DefaultStrategy;
+ $config = new StrategyConfiguration($arguments);
+
+ $strategy = $this->pickStrategy($arguments->algorithm(), $config);
$timer = new Timer;
$timer->start();
- $clones = (new Detector($strategy))->copyPasteDetection(
- $files,
- $arguments->linesThreshold(),
- $arguments->tokensThreshold(),
- $arguments->fuzzy()
- );
+ $clones = (new Detector($strategy))->copyPasteDetection($files);
(new Text)->printResult($clones, $arguments->verbose());
@@ -93,6 +94,21 @@ private function printVersion(): void
);
}
+ /**
+  * Maps the algorithm name given on the command line to a strategy instance.
+  *
+  * A missing name selects the Rabin-Karp based default strategy.
+  *
+  * @throws Exception when the name is not one of the supported algorithms
+  */
+ private function pickStrategy(?string $algorithm, StrategyConfiguration $config): AbstractStrategy
+ {
+     // Loose comparison is kept deliberately: it mirrors the semantics of a
+     // switch statement, under which null and '' both select the default.
+     if ($algorithm == null || $algorithm == 'rabin-karp') {
+         return new DefaultStrategy($config);
+     }
+
+     if ($algorithm == 'suffixtree') {
+         return new SuffixTreeStrategy($config);
+     }
+
+     throw new Exception('Unsupported algorithm: ' . $algorithm);
+ }
+
private function help(): void
{
print <<<'EOT'
@@ -108,9 +124,12 @@ private function help(): void
Options for analysing files:
- --fuzzy Fuzz variable names
- --min-lines
+ * This only works correctly if the given word is closed using a sentinel
+ * character.
+ *
+ * @param AbstractToken[] $word List of tokens to analyze
+ */
+ public function __construct(array $word)
+ {
+     parent::__construct($word);
+
+     // Square MAX_LENGTH x MAX_LENGTH edit distance matrix, zero-filled.
+     $zeroRow = array_fill(0, $this->MAX_LENGTH, 0);
+     $this->edBuffer = array_fill(0, $this->MAX_LENGTH, $zeroRow);
+
+     // The child lists have to exist before the leaves below each node can
+     // be counted, starting from the root (node 0).
+     $this->ensureChildLists();
+     $this->leafCount = array_fill(0, $this->numNodes, 0);
+     $this->initLeafCount(0);
+ }
+
+ /**
+ * @todo Add options:
+ * --min-tokens
+ * --min-lines
+ * --edit-distance
+ * @todo Possibly add consumer from original code.
+ */
+
+ /**
+  * Detects the clones contained in the token list passed to the constructor.
+  *
+  * The first pass walks every start position of the word and reports clones
+  * into {@link #cloneInfos}. The second pass keeps, per source line of the
+  * clone's first token, the longest clone that is strictly longer than
+  * $minLength, and returns those sorted by descending length.
+  *
+  * @param int $minLength    the minimal length of a clone in tokens (not lines)
+  * @param int $maxErrors    the maximal number of errors/gaps allowed
+  * @param int $headEquality the number of elements which have to be the same at the beginning of a clone
+  *
+  * @return CloneInfo[]
+  */
+ public function findClones(int $minLength, int $maxErrors, int $headEquality): array
+ {
+     $this->minLength = $minLength;
+     $this->headEquality = $headEquality;
+     $this->cloneInfos = [];
+
+     $wordLength = count($this->word);
+
+     for ($start = 0; $start < $wordLength; $start++) {
+         // Quick start: the first token has to match anyway.
+         $node = $this->nextNode->get(0, $this->word[$start]);
+
+         if ($node < 0 || $this->leafCount[$node] <= 1) {
+             continue;
+         }
+
+         // The word itself is part of the suffix tree, so the edge leading
+         // to $node is an exact match of $length tokens.
+         $length = $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node];
+         $numReported = 0;
+         $edge = $this->nodeChildFirst[$node];
+
+         while ($edge >= 0) {
+             $reported = $this->matchWord(
+                 $start,
+                 $start + $length,
+                 $this->nodeChildNode[$edge],
+                 $length,
+                 $maxErrors
+             );
+
+             if ($reported) {
+                 $numReported++;
+             }
+
+             $edge = $this->nodeChildNext[$edge];
+         }
+
+         if ($numReported != 1 && $length >= $this->minLength) {
+             $this->reportClone($start, $start + $length, $node, $length, $length);
+         }
+     }
+
+     // Longest clone per source line; positions are visited in ascending
+     // order so that on equal length the earliest entry is kept.
+     $longestPerLine = [];
+
+     for ($index = 0; $index <= $wordLength; $index++) {
+         foreach ($this->cloneInfos[$index] ?? [] as $candidate) {
+             // length = number of tokens
+             // TODO: min token length
+             if ($candidate->length <= $minLength) {
+                 continue;
+             }
+
+             $line = $candidate->token->line;
+
+             if (!isset($longestPerLine[$line]) || $candidate->length > $longestPerLine[$line]->length) {
+                 $longestPerLine[$line] = $candidate;
+             }
+         }
+     }
+
+     /** @var CloneInfo[] */
+     $sorted = array_values($longestPerLine);
+     usort($sorted, static function (CloneInfo $a, CloneInfo $b): int {
+         return $b->length <=> $a->length;
+     });
+
+     return $sorted;
+ }
+
+ /**
+  * Tells whether the given token must never be matched against any other
+  * token. This is the case for sentinels.
+  */
+ protected function mayNotMatch(AbstractToken $token): bool
+ {
+     $isSentinel = $token instanceof Sentinel;
+
+     return $isSentinel;
+ }
+
+ /**
+  * Called whenever {@link #MAX_LENGTH} is too small and hence the
+  * {@link #edBuffer} was not large enough. As a consequence a really large
+  * clone may be reported in multiple chunks of size {@link #MAX_LENGTH}, and
+  * minor parts of such a clone might be lost.
+  *
+  * This implementation only prints a diagnostic line to standard output.
+  */
+ protected function reportBufferShortage(int $leafStart, int $leafLength): void
+ {
+     print 'Encountered buffer shortage: ' . implode(' ', [$leafStart, $leafLength]) . "\n";
+ }
+
+ /**
+  * Fills {@link #leafCount} for the subtree rooted at $node: every node gets
+  * the number of leaves reachable from it, and a leaf itself counts as 1.
+  */
+ private function initLeafCount(int $node): void
+ {
+     $leaves = 0;
+     $edge = $this->nodeChildFirst[$node];
+
+     while ($edge >= 0) {
+         $child = $this->nodeChildNode[$edge];
+         $this->initLeafCount($child);
+         $leaves += $this->leafCount[$child];
+         $edge = $this->nodeChildNext[$edge];
+     }
+
+     // A node without children is a leaf and counts as one.
+     $this->leafCount[$node] = $leaves === 0 ? 1 : $leaves;
+ }
+
+ /**
+  * Performs the approximate matching between the input word and the tree.
+  *
+  * The method first determines how far the edge leading to $node can be
+  * followed within the error budget, picks the cheapest end point in the
+  * edit distance buffer, extends it over further equal tokens, recurses into
+  * the children when the whole edge was consumed, and finally reports a
+  * clone for this node unless exactly one subtree already reported one.
+  *
+  * @param int $wordStart      the start position of the currently matched word (position in
+  *                            the input word)
+  * @param int $wordPosition   the current position along the input word
+  * @param int $node           the node we are currently at (i.e. the edge leading to this
+  *                            node is relevant to us).
+  * @param int $nodeWordLength the length of the word found along the nodes (this may be
+  *                            different from the length along the input word due to gaps)
+  * @param int $maxErrors      the number of errors still allowed
+  *
+  * @return bool whether some clone was reported
+  */
+ private function matchWord(int $wordStart, int $wordPosition, int $node, int $nodeWordLength, int $maxErrors): bool
+ {
+     // We are aware that this method is longer than desirable for code
+     // reading. However, we currently do not see a refactoring that has a
+     // sensible cost-benefit ratio. Suggestions are welcome!
+
+     // self match?
+     if ($this->leafCount[$node] == 1 && $this->nodeWordBegin[$node] == $wordPosition) {
+         return false;
+     }
+
+     // The edge label is capped so that it always fits into the edit distance buffer.
+     $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1);
+
+     // Do min edit distance
+     $currentLength = $this->calculateMaxLength(
+         $wordStart,
+         $wordPosition,
+         $node,
+         $maxErrors,
+         $currentNodeWordLength
+     );
+
+     if ($currentLength == 0) {
+         return false;
+     }
+
+     if ($currentLength >= $this->MAX_LENGTH - 1) {
+         $this->reportBufferShortage($this->nodeWordBegin[$node], $currentNodeWordLength);
+     }
+
+     // calculate cheapest match: scan the last row and the last column of
+     // the filled part of the buffer; $best starts above any reachable value
+     $best = $maxErrors + 42;
+     $iBest = 0;
+     $jBest = 0;
+
+     for ($k = 0; $k <= $currentLength; $k++) {
+         $i = $currentLength - $k;
+         $j = $currentLength;
+
+         if ($this->edBuffer[$i][$j] < $best) {
+             $best = $this->edBuffer[$i][$j];
+             $iBest = $i;
+             $jBest = $j;
+         }
+
+         $i = $currentLength;
+         $j = $currentLength - $k;
+
+         if ($this->edBuffer[$i][$j] < $best) {
+             $best = $this->edBuffer[$i][$j];
+             $iBest = $i;
+             $jBest = $j;
+         }
+     }
+
+     // Extend the match over further tokens that are equal but are not the
+     // very same token object. The identity operator is required here: the
+     // loose != operator compares objects property by property, so two
+     // distinct tokens with identical file, line and content would wrongly
+     // be treated as "the same token" and stop the extension early.
+     while ($wordPosition + $iBest < count($this->word) &&
+         $jBest < $currentNodeWordLength &&
+         $this->word[$wordPosition + $iBest] !== $this->word[$this->nodeWordBegin[$node] + $jBest] &&
+         $this->word[$wordPosition + $iBest]->equals(
+             $this->word[$this->nodeWordBegin[$node] + $jBest]
+         )) {
+         $iBest++;
+         $jBest++;
+     }
+
+     $numReported = 0;
+
+     if ($currentLength == $currentNodeWordLength) {
+         // we may proceed: the whole edge was consumed, so descend into the
+         // children with the remaining error budget
+         for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
+             if ($this->matchWord(
+                 $wordStart,
+                 $wordPosition + $iBest,
+                 $this->nodeChildNode[$e],
+                 $nodeWordLength + $jBest,
+                 $maxErrors
+                     - $best
+             )) {
+                 $numReported++;
+             }
+         }
+     }
+
+     // do not report locally if had reports in exactly one subtree (would be
+     // pure subclone)
+     if ($numReported == 1) {
+         return true;
+     }
+
+     // disallow tail changes: shrink the match until both sides end on
+     // equal tokens
+     while ($iBest > 0 &&
+         $jBest > 0 &&
+         !$this->word[$wordPosition + $iBest - 1]->equals(
+             $this->word[$this->nodeWordBegin[$node] + $jBest - 1]
+         )) {
+         if ($iBest > 1 &&
+             $this->word[$wordPosition + $iBest - 2]->equals(
+                 $this->word[$this->nodeWordBegin[$node] + $jBest - 1]
+             )) {
+             $iBest--;
+         } elseif ($jBest > 1 &&
+             $this->word[$wordPosition + $iBest - 1]->equals(
+                 $this->word[$this->nodeWordBegin[$node] + $jBest - 2]
+             )) {
+             $jBest--;
+         } else {
+             $iBest--;
+             $jBest--;
+         }
+     }
+
+     // report if real clone
+     if ($iBest > 0 && $jBest > 0) {
+         $numReported++;
+         $this->reportClone($wordStart, $wordPosition + $iBest, $node, $jBest, $nodeWordLength + $jBest);
+     }
+
+     return $numReported > 0;
+ }
+
+ /**
+ * Calculates the maximum length we may take along the word to the current
+ * $node (respecting the number of errors to make).
+ *
+ * The edit distance buffer is grown one row and one column per iteration.
+ * Growing stops at the end of the input word, at a token that may not be
+ * matched (sentinel), or as soon as the error budget or the head equality
+ * requirement is violated.
+ *
+ * @param int $wordStart the start position of the currently matched word (position in
+ * the input word)
+ * @param int $wordPosition the current position along the input word
+ * @param int $node the node we are currently at (i.e. the edge leading to this
+ * node is relevant to us).
+ * @param int $maxErrors the number of errors still allowed
+ * @param int $currentNodeWordLength the length of the word found along the nodes (this may be
+ * different from the actual length due to buffer limits)
+ *
+ * @return int the maximal length that can be taken
+ */
+ private function calculateMaxLength(
+ int $wordStart,
+ int $wordPosition,
+ int $node,
+ int $maxErrors,
+ int $currentNodeWordLength
+ ) {
+ $this->edBuffer[0][0] = 0;
+ $currentLength = 1;
+
+ for (; $currentLength <= $currentNodeWordLength; $currentLength++) {
+ /** @var int */
+ $best = $currentLength;
+ // border cells of the matrix: distance to the empty prefix
+ $this->edBuffer[0][$currentLength] = $currentLength;
+ $this->edBuffer[$currentLength][0] = $currentLength;
+
+ // stop at the end of the input word
+ if ($wordPosition + $currentLength >= count($this->word)) {
+ break;
+ }
+
+ // deal with case that character may not be matched (sentinel!)
+ $iChar = $this->word[$wordPosition + $currentLength - 1];
+ $jChar = $this->word[$this->nodeWordBegin[$node] + $currentLength - 1];
+
+ if ($this->mayNotMatch($iChar) || $this->mayNotMatch($jChar)) {
+ break;
+ }
+
+ // usual matrix completion for edit distance: first the cells
+ // ($k, $currentLength), then ($currentLength, $k), then the corner;
+ // $best tracks the smallest value written in this iteration
+ for ($k = 1; $k < $currentLength; $k++) {
+ $best = min(
+ $best,
+ $this->fillEDBuffer(
+ $k,
+ $currentLength,
+ $wordPosition,
+ $this->nodeWordBegin[$node]
+ )
+ );
+ }
+
+ for ($k = 1; $k < $currentLength; $k++) {
+ $best = min(
+ $best,
+ $this->fillEDBuffer(
+ $currentLength,
+ $k,
+ $wordPosition,
+ $this->nodeWordBegin[$node]
+ )
+ );
+ }
+ $best = min(
+ $best,
+ $this->fillEDBuffer(
+ $currentLength,
+ $currentLength,
+ $wordPosition,
+ $this->nodeWordBegin[$node]
+ )
+ );
+
+ // && binds tighter than ||: stop when the error budget is exceeded,
+ // or when we are still within the head-equality prefix and already
+ // needed at least one edit
+ if ($best > $maxErrors ||
+ $wordPosition - $wordStart + $currentLength <= $this->headEquality &&
+ $best > 0) {
+ break;
+ }
+ }
+ // the length that ended the loop is not usable, so step back by one
+ $currentLength--;
+
+ return $currentLength;
+ }
+
+ /**
+  * Records a clone of the word range [$wordBegin, $wordEnd) unless it is
+  * shorter than the minimal length or dominated by an already known clone
+  * starting at most INDEX_SPREAD - 1 positions earlier.
+  *
+  * The clone is stored in {@link #cloneInfos} at every INDEX_SPREAD-th
+  * position it covers, and likewise for each of its other occurrences.
+  */
+ private function reportClone(
+     int $wordBegin,
+     int $wordEnd,
+     int $currentNode,
+     int $nodeWordPos,
+     int $nodeWordLength
+ ): void {
+     $length = $wordEnd - $wordBegin;
+
+     if ($length < $this->minLength || $nodeWordLength < $this->minLength) {
+         return;
+     }
+
+     // NB: 0 and 0 are to indicate the template S and T for Psalm, in lack of generics.
+     $otherClones = new PairList(16, 0, 0);
+     $remainingEdgeLength = $this->nodeWordEnd[$currentNode] - $this->nodeWordBegin[$currentNode] - $nodeWordPos;
+     $this->findRemainingClones($otherClones, $nodeWordLength, $currentNode, $remainingEdgeLength, $wordBegin);
+
+     $otherCount = $otherClones->size();
+     $occurrences = 1 + $otherCount;
+     $firstToken = $this->word[$wordBegin];
+
+     // check whether we may start from here
+     $candidate = new CloneInfo($length, $wordBegin, $occurrences, $firstToken, $otherClones);
+     $earliest = max(0, $wordBegin - $this->INDEX_SPREAD + 1);
+
+     for ($index = $earliest; $index <= $wordBegin; $index++) {
+         foreach ($this->cloneInfos[$index] ?? [] as $known) {
+             if ($known->dominates($candidate, $wordBegin - $index)) {
+                 // we already have a dominating clone, so ignore
+                 return;
+             }
+         }
+     }
+
+     // register the clone along its own range to avoid getting more duplicates
+     for ($position = $wordBegin; $position < $wordEnd; $position += $this->INDEX_SPREAD) {
+         $this->cloneInfos[$position][] = new CloneInfo(
+             $length - ($position - $wordBegin),
+             $wordBegin,
+             $occurrences,
+             $firstToken,
+             $otherClones
+         );
+     }
+
+     // ... and along the range of every other occurrence
+     for ($clone = 0; $clone < $otherCount; $clone++) {
+         $start = $otherClones->getFirst($clone);
+         $otherLength = $otherClones->getSecond($clone);
+
+         for ($offset = 0; $offset < $otherLength; $offset += $this->INDEX_SPREAD) {
+             $this->cloneInfos[$start + $offset][] = new CloneInfo(
+                 $otherLength - $offset,
+                 $wordBegin,
+                 $occurrences,
+                 $firstToken,
+                 $otherClones
+             );
+         }
+     }
+ }
+
+ /**
+  * Computes one cell (i,j) of the edit distance buffer from its three
+  * already filled neighbours and stores it.
+  *
+  * @param int $i       the first index of the buffer
+  * @param int $j       the second index of the buffer
+  * @param int $iOffset the offset where the word described by $i starts
+  * @param int $jOffset the offset where the word described by $j starts
+  *
+  * @return int the value inserted into the buffer
+  */
+ private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset)
+ {
+     $left = $this->word[$iOffset + $i - 1];
+     $right = $this->word[$jOffset + $j - 1];
+
+     // cost of reaching the cell through an insertion or a deletion
+     $gapCost = min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]) + 1;
+
+     // cost of reaching the cell diagonally; free when both tokens are equal
+     $diagonalCost = $this->edBuffer[$i - 1][$j - 1];
+
+     if (!$left->equals($right)) {
+         $diagonalCost++;
+     }
+
+     $cost = min($gapCost, $diagonalCost);
+     $this->edBuffer[$i][$j] = $cost;
+
+     return $cost;
+ }
+
+ /**
+  * Collects start position and length of the remaining occurrences of a
+  * clone by visiting all leaves below the given node.
+  *
+  * @param PairList $clonePositions the clone positions being filled (start position and length)
+  * @param int      $nodeWordLength the length of the word along the nodes
+  * @param int      $currentNode    the node we are currently at
+  * @param int      $distance       the distance along the word leading to the current node
+  * @param int      $wordStart      the start of the currently searched word
+  */
+ private function findRemainingClones(
+     PairList $clonePositions,
+     int $nodeWordLength,
+     int $currentNode,
+     int $distance,
+     int $wordStart
+ ): void {
+     $edge = $this->nodeChildFirst[$currentNode];
+
+     if ($edge < 0) {
+         // Leaf: derive the start position from the end of the word; the
+         // occurrence currently being searched is not listed.
+         $start = count($this->word) - $distance - $nodeWordLength;
+
+         if ($start != $wordStart) {
+             $clonePositions->add($start, $nodeWordLength);
+         }
+
+         return;
+     }
+
+     while ($edge >= 0) {
+         $child = $this->nodeChildNode[$edge];
+         $edgeLength = $this->nodeWordEnd[$child] - $this->nodeWordBegin[$child];
+         $this->findRemainingClones($clonePositions, $nodeWordLength, $child, $distance + $edgeLength, $wordStart);
+         $edge = $this->nodeChildNext[$edge];
+     }
+ }
+}
diff --git a/src/Detector/Strategy/SuffixTree/CloneInfo.php b/src/Detector/Strategy/SuffixTree/CloneInfo.php
new file mode 100644
index 00000000..4187996c
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/CloneInfo.php
@@ -0,0 +1,68 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+ /** Value holder describing a single detected clone. */
+ class CloneInfo
+ {
+     /**
+      * Length of the clone in tokens.
+      *
+      * @var int
+      */
+     public $length;
+
+     /**
+      * Position in word list.
+      *
+      * @var int
+      */
+     public $position;
+
+     /**
+      * Token handed to the constructor for this clone.
+      *
+      * @var AbstractToken
+      */
+     public $token;
+
+     /**
+      * Related clones.
+      *
+      * @var PairList
+      */
+     public $otherClones;
+
+     /**
+      * Number of occurrences of the clone.
+      *
+      * @var int
+      */
+     private $occurrences;
+
+     /** Stores the given values unchanged. */
+     public function __construct(int $length, int $position, int $occurrences, AbstractToken $token, PairList $otherClones)
+     {
+         $this->length = $length;
+         $this->position = $position;
+         $this->occurrences = $occurrences;
+         $this->token = $token;
+         $this->otherClones = $otherClones;
+     }
+
+     /**
+      * Returns whether this clone info dominates the given one, i.e. whether
+      * both {@link #length} (reduced by $later) and {@link #occurrences} are
+      * not smaller than those of the given clone info.
+      *
+      * @param self $ci    the clone info to compare against
+      * @param int  $later the amount the given clone starts later than the "this" clone
+      */
+     public function dominates(self $ci, int $later): bool
+     {
+         if ($this->occurrences < $ci->occurrences) {
+             return false;
+         }
+
+         return $ci->length <= $this->length - $later;
+     }
+ }
diff --git a/src/Detector/Strategy/SuffixTree/PairList.php b/src/Detector/Strategy/SuffixTree/PairList.php
new file mode 100644
index 00000000..c0b851ec
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/PairList.php
@@ -0,0 +1,227 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+use Exception;
+
+/**
+ * A list for storing pairs in a specific order.
+ *
+ * @author $Author: hummelb $
+ *
+ * @version $Rev: 51770 $
+ * @ConQAT.Rating GREEN Hash: 7459D6D0F59028B37DD23DD091BDCEEA
+ *
+ * @template T
+ * @template S
+ */
+class PairList
+{
+ /**
+ * Version used for serialization.
+ *
+ * @var int
+ */
+ private $serialVersionUID = 1;
+
+ /**
+ * The current size.
+ *
+ * @var int
+ */
+ private $size = 0;
+
+ /**
+ * The array used for storing the S.
+ *
+ * @var S[]
+ */
+ private $firstElements;
+
+ /**
+ * The array used for storing the T.
+ *
+ * @var T[]
+ */
+ private $secondElements;
+
+ /**
+  * Creates an empty list with room for at least one pair.
+  *
+  * The two type arguments are not used at runtime.
+  *
+  * @param S $firstType
+  * @param T $secondType
+  */
+ public function __construct(int $initialCapacity, $firstType, $secondType)
+ {
+     $slots = array_fill(0, max(1, $initialCapacity), null);
+
+     $this->firstElements = $slots;
+     $this->secondElements = $slots;
+ }
+
+ /** Tells whether the list currently holds no pairs. */
+ public function isEmpty(): bool
+ {
+     return 0 == $this->size;
+ }
+
+ /** Returns the number of pairs currently stored. */
+ public function size(): int
+ {
+     $count = $this->size;
+
+     return $count;
+ }
+
+ /**
+  * Appends the given pair at the end of the list.
+  *
+  * @param S $first
+  * @param T $second
+  */
+ public function add($first, $second): void
+ {
+     $slot = $this->size;
+
+     $this->firstElements[$slot] = $first;
+     $this->secondElements[$slot] = $second;
+     $this->size = $slot + 1;
+ }
+
+ /** Appends every pair of another list (which may be this list itself). */
+ public function addAll(self $other): void
+ {
+     // Snapshot the size up front: it grows while appending when $other
+     // is the very same list as $this.
+     $pairsToCopy = $other->size;
+
+     for ($index = 0; $index < $pairsToCopy; $index++) {
+         $this->add($other->firstElements[$index], $other->secondElements[$index]);
+     }
+ }
+
+ /**
+  * Returns the first component of the pair at the given index, after the
+  * index has passed the bounds check.
+  *
+  * @return S
+  */
+ public function getFirst(int $i)
+ {
+     $this->checkWithinBounds($i);
+     $first = $this->firstElements[$i];
+
+     return $first;
+ }
+
+ /**
+  * Replaces the first component of the pair at the given index, after the
+  * index has passed the bounds check.
+  *
+  * @param S $value
+  */
+ public function setFirst(int $i, $value): void
+ {
+     $this->checkWithinBounds($i);
+
+     $this->firstElements[$i] = $value;
+ }
+
+ /**
+  * Returns the second component of the pair at the given index, after the
+  * index has passed the bounds check.
+  *
+  * @return T
+  */
+ public function getSecond(int $i)
+ {
+     $this->checkWithinBounds($i);
+     $second = $this->secondElements[$i];
+
+     return $second;
+ }
+
+ /**
+  * Replaces the second component of the pair at the given index, after the
+  * index has passed the bounds check.
+  *
+  * @param T $value
+  */
+ public function setSecond(int $i, $value): void
+ {
+     $this->checkWithinBounds($i);
+
+     $this->secondElements[$i] = $value;
+ }
+
+ /**
+  * Copies the first components of all stored pairs into a new list.
+  *
+  * @return S[]
+  */
+ public function extractFirstList(): array
+ {
+     $firsts = [];
+     $index = 0;
+
+     while ($index < $this->size) {
+         $firsts[] = $this->firstElements[$index];
+         $index++;
+     }
+
+     return $firsts;
+ }
+
+ /**
+  * Copies the second components of all stored pairs into a new list.
+  *
+  * @return T[]
+  */
+ public function extractSecondList(): array
+ {
+     $seconds = [];
+     $index = 0;
+
+     while ($index < $this->size) {
+         $seconds[] = $this->secondElements[$index];
+         $index++;
+     }
+
+     return $seconds;
+ }
+
+ /** Exchanges the pairs stored at indexes $i and $j. */
+ public function swapEntries(int $i, int $j): void
+ {
+     // Read both pairs through the accessors first, then write them back
+     // crosswise.
+     $firstAtI = $this->getFirst($i);
+     $secondAtI = $this->getSecond($i);
+     $firstAtJ = $this->getFirst($j);
+     $secondAtJ = $this->getSecond($j);
+
+     $this->setFirst($i, $firstAtJ);
+     $this->setSecond($i, $secondAtJ);
+     $this->setFirst($j, $firstAtI);
+     $this->setSecond($j, $secondAtI);
+ }
+
+ /**
+  * Empties the list by resetting its size; the backing arrays are left
+  * untouched.
+  */
+ public function clear(): void
+ {
+     $emptySize = 0;
+     $this->size = $emptySize;
+ }
+
+ /**
+  * Removes the last element of the list.
+  *
+  * Calling this on an empty list is a no-op. Without the guard the size
+  * would become negative, and a following add() would then write to a
+  * negative index.
+  */
+ public function removeLast(): void
+ {
+     if ($this->size > 0) {
+         $this->size--;
+     }
+ }
+
+ /**
+  * Combines the size and the CRC32 checksums of both serialized backing
+  * arrays into one hash value, using 31 as multiplier.
+  */
+ public function hashCode(): int
+ {
+     $hash = $this->size;
+
+     foreach ([$this->firstElements, $this->secondElements] as $elements) {
+         $hash = 31 * $hash + crc32(serialize($elements));
+     }
+
+     return $hash;
+ }
+
+ /**
+ * Checks whether the given
+ * We use some conventions which are slightly different from the paper however:
+ *
+ * Everything but the construction itself is protected to simplify increasing
+ * its functionality by subclassing but without introducing new method calls.
+ *
+ * @author Benjamin Hummel
+ * @author $Author: kinnen $
+ *
+ * @version $Revision: 41751 $
+ * @ConQAT.Rating GREEN Hash: 4B2EF0606B3085A6831764ED042FF20D
+ */
+class SuffixTree
+{
+ /**
+ * Infinity in this context.
+ *
+ * @var int
+ */
+ protected $INFTY;
+
+ /**
+ * The word we are working on.
+ *
+ * @var AbstractToken[]
+ */
+ protected $word;
+
+ /**
+ * The number of nodes created so far.
+ *
+ * @var int
+ */
+ protected $numNodes = 0;
+
+ /**
+ * For each node this holds the index of the first character of
+ * {@link #word} labeling the transition to this node. This
+ * corresponds to the k for a transition used in Ukkonen's paper.
+ *
+ * @var int[]
+ */
+ protected $nodeWordBegin;
+
+ /**
+ * For each node this holds the index of the one after the last character of
+ * {@link #word} labeling the transition to this node. This
+ * corresponds to the p for a transition used in Ukkonen's paper.
+ *
+ * @var int[]
+ */
+ protected $nodeWordEnd;
+
+ /** For each node its suffix link (called function f by Ukkonen).
+ * @var int[] */
+ protected $suffixLink;
+
+ /**
+ * The next node function realized as a hash table. This corresponds to the
+ * g function used in Ukkonen's paper.
+ *
+ * @var SuffixTreeHashTable
+ */
+ protected $nextNode;
+
+ /**
+ * An array giving for each node the index where the first child will be
+ * stored (or -1 if it has no children). It is initially empty and will be
+ * filled "on demand" using
+ * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])}
+ * .
+ *
+ * @var int[]
+ */
+ protected $nodeChildFirst = [];
+
+ /**
+ * This array gives the next index of the child list or -1 if this is the
+ * last one. It is initially empty and will be filled "on demand" using
+ * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])}
+ * .
+ *
+ * @var int[]
+ */
+ protected $nodeChildNext = [];
+
+ /**
+ * This array stores the actual name (=number) of the node in the child
+ * list. It is initially empty and will be filled "on demand" using
+ * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])}
+ * .
+ *
+ * @var int[]
+ */
+ protected $nodeChildNode = [];
+
+ /**
+ * The node we are currently at as a "global" variable (as it is always
+ * passed unchanged). This is called s in Ukkonen's paper.
+ *
+ * @var int
+ */
+ private $currentNode = 0;
+
+ /**
+ * Beginning of the word part of the reference pair. This is kept "global"
+ * (in contrast to the end) as this is passed unchanged to all functions.
+ * Ukkonen calls this k.
+ *
+ * @var int
+ */
+ private $refWordBegin = 0;
+
+ /**
+ * This is the new (or old) explicit state as returned by
+ * {@link #testAndSplit(int, Object)}. Ukkonen calls this r.
+ *
+ * @var int
+ */
+ private $explicitNode = 0;
+
+ /**
+  * Builds the suffix tree for the given word by inserting its tokens one
+  * after another. The word is used internally and should not be modified
+  * afterwards, so copy it beforehand if required.
+  *
+  * @param AbstractToken[] $word
+  */
+ public function __construct($word)
+ {
+     $length = count($word);
+
+     $this->word = $word;
+     // The word length serves as "infinity" for open edge ends.
+     $this->INFTY = $length;
+
+     // Node storage is pre-sized to twice the word length.
+     $capacity = 2 * $length;
+     $this->nodeWordBegin = array_fill(0, $capacity, 0);
+     $this->nodeWordEnd = array_fill(0, $capacity, 0);
+     $this->suffixLink = array_fill(0, $capacity, 0);
+     $this->nextNode = new SuffixTreeHashTable($capacity);
+
+     $this->createRootNode();
+
+     $position = 0;
+
+     while ($position < $length) {
+         $this->update($position);
+         $position++;
+         $this->canonize($position);
+     }
+ }
+
+ /**
+  * Builds the child list arrays (required for traversing the tree) unless
+  * they already cover all nodes created so far.
+  */
+ protected function ensureChildLists(): void
+ {
+     $upToDate = $this->nodeChildFirst != null && count($this->nodeChildFirst) >= $this->numNodes;
+
+     if ($upToDate) {
+         return;
+     }
+
+     $blank = array_fill(0, $this->numNodes, 0);
+     $this->nodeChildFirst = $blank;
+     $this->nodeChildNext = $blank;
+     $this->nodeChildNode = $blank;
+
+     // The hash table fills all three arrays through its reference parameters.
+     $this->nextNode->extractChildLists($this->nodeChildFirst, $this->nodeChildNext, $this->nodeChildNode);
+ }
+
+ /**
+  * Sets up node 0 as the root: an empty edge label and a suffix link of -1.
+  */
+ private function createRootNode(): void
+ {
+     $root = 0;
+
+     $this->nodeWordBegin[$root] = 0;
+     $this->nodeWordEnd[$root] = 0;
+     $this->suffixLink[$root] = -1;
+     $this->numNodes = 1;
+ }
+
+ /**
+ * The update function as defined in Ukkonen's paper. This inserts
+ * the character at charPos into the tree. It works on the canonical
+ * reference pair ({@link #currentNode}, ({@link #refWordBegin}, charPos)).
+ *
+ * As long as testAndSplit() reports that the end point is not reached, a
+ * new node with an open edge starting at charPos is attached to
+ * {@link #explicitNode}, and the suffix link of the current node is followed.
+ */
+ private function update(int $charPos): void
+ {
+ // explicit node handled in the previous iteration; 0 means "none yet"
+ $lastNode = 0;
+
+ // NOTE(review): testAndSplit() is not visible here; presumably it sets
+ // $this->explicitNode as a side effect - confirm
+ while (!$this->testAndSplit($charPos, $this->word[$charPos])) {
+ // allocate the next free node; its edge label starts at $charPos and
+ // is open ended (INFTY)
+ $newNode = $this->numNodes++;
+ $this->nodeWordBegin[$newNode] = $charPos;
+ $this->nodeWordEnd[$newNode] = $this->INFTY;
+ $this->nextNode->put($this->explicitNode, $this->word[$charPos], $newNode);
+
+ // link the previously handled explicit node to the current one
+ if ($lastNode != 0) {
+ $this->suffixLink[$lastNode] = $this->explicitNode;
+ }
+ $lastNode = $this->explicitNode;
+ // continue along the suffix link and re-canonize the reference pair
+ $this->currentNode = $this->suffixLink[$this->currentNode];
+ $this->canonize($charPos);
+ }
+
+ // close the chain of suffix links at the node where the loop ended
+ if ($lastNode != 0) {
+ $this->suffixLink[$lastNode] = $this->currentNode;
+ }
+ }
+
+ /**
+ * The test-and-split function as defined in Ukkonen's paper. This
+ * checks whether the state given by the canonical reference pair (
+ * {@link #currentNode}, ({@link #refWordBegin}, refWordEnd)) is the end
+ * point (by checking whether a transition for the
+ *
+ * It hashes from (node, character) pairs to the next node, where nodes are
+ * represented by integers and the type of characters is determined by the
+ * generic parameter.
+ *
+ * @author Benjamin Hummel
+ * @author $Author: juergens $
+ *
+ * @version $Revision: 34670 $
+ * @ConQAT.Rating GREEN Hash: 6A7A830078AF0CA9C2D84C148F336DF4
+ */
+class SuffixTreeHashTable
+{
+ /**
+ * These numbers were taken from
+ * http://planetmath.org/encyclopedia/GoodHashTablePrimes.html.
+ *
+ * @var int[]
+ */
+ private $allowedSizes = [53, 97, 193, 389, 769, 1543,
+ 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433,
+ 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319,
+ 201326611, 402653189, 805306457, 1610612741, ];
+
+ /**
+ * The size of the hash table.
+ *
+ * @var int
+ */
+ private $tableSize;
+
+ /**
+ * Storage space for the node part of the key.
+ *
+ * @var int[]
+ */
+ private $keyNodes;
+
+ /**
+ * Storage space for the character part of the key.
+ *
+ * @var array
+ * The method is package visible, as it is tightly coupled to the
+ * {@link SuffixTree} class.
+ *
+ * @param int[] $nodeFirstIndex an array giving for each node the index where the first child
+ * will be stored (or -1 if it has no children)
+ * @param int[] $nodeNextIndex this array gives the next index of the child list or -1 if
+ * this is the last one
+ * @param int[] $nodeChild this array stores the actual name (=number) of the node in the
+ * child list
+ */
+ public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild): void
+ {
+     // Start with "no children" (-1) for every node, keeping the keys as they are.
+     $nodeFirstIndex = array_fill_keys(array_keys($nodeFirstIndex), -1);
+     $free = 0;
+
+     for ($slot = 0; $slot < $this->tableSize; $slot++) {
+         if ($this->keyChars[$slot] === null) {
+             // unused hash table slot
+             continue;
+         }
+
+         // insert $this->keyNodes[$slot] -> $this->resultNodes[$slot] at
+         // the head of the parent's child list
+         $parent = $this->keyNodes[$slot];
+         $nodeChild[$free] = $this->resultNodes[$slot];
+         $nodeNextIndex[$free] = $nodeFirstIndex[$parent];
+         $nodeFirstIndex[$parent] = $free;
+         $free++;
+     }
+ }
+
+ /**
+  * Locates the slot of the (node,char) key in the hash table. When the key
+  * is not stored yet, the free slot where it has to be inserted is returned.
+  *
+  * Probing starts at the primary hash position and advances by the
+  * secondary hash until the key or an empty slot is found.
+  */
+ private function hashFind(int $keyNode, AbstractToken $keyChar): int
+ {
+     $this->_numFind++;
+
+     $charHash = $keyChar->hashCode();
+     $slot = $this->posMod($this->primaryHash($keyNode, $charHash));
+     $step = $this->secondaryHash($keyNode, $charHash);
+
+     while ($this->keyChars[$slot] !== null &&
+         !($this->keyNodes[$slot] === $keyNode && $keyChar->equals($this->keyChars[$slot]))) {
+         // occupied by a different key: count the collision and probe on
+         $this->_numColl++;
+         $slot = ($slot + $step) % $this->tableSize;
+     }
+
+     return $slot;
+ }
+
+ /**
+  * Computes the primary hash value for a (node, character) key pair: the
+  * character hash XORed with 13 times the node number.
+  */
+ private function primaryHash(int $keyNode, int $keyCharHash): int
+ {
+     $nodePart = 13 * $keyNode;
+
+     return $keyCharHash ^ $nodePart;
+ }
+
+ /**
+  * Computes the secondary hash value for a (node, character) key pair.
+  * The value 0 is replaced by 2, so the result is never 0.
+  */
+ private function secondaryHash(int $keyNode, int $keyCharHash): int
+ {
+     $candidate = $this->posMod($keyCharHash ^ (1025 * $keyNode));
+
+     return $candidate === 0 ? 2 : $candidate;
+ }
+
+ /**
+  * Returns the smallest non-negative number congruent to x modulo
+  * {@link #tableSize}.
+  */
+ private function posMod(int $x): int
+ {
+     $remainder = $x % $this->tableSize;
+
+     // PHP's % keeps the sign of the dividend, so shift negatives up.
+     return $remainder < 0 ? $remainder + $this->tableSize : $remainder;
+ }
+}
diff --git a/src/Detector/Strategy/SuffixTree/Token.php b/src/Detector/Strategy/SuffixTree/Token.php
new file mode 100644
index 00000000..d34f9074
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/Token.php
@@ -0,0 +1,42 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+ /**
+  * A token of the analysed source code, as stored in the word of the suffix tree.
+  */
+ class Token extends AbstractToken
+ {
+     /**
+      * @param int    $tokenCode the numeric token identifier
+      * @param string $tokenName the symbolic name of the token
+      * @param int    $line      the line the token was found on
+      * @param string $file      the file the token was found in
+      * @param string $content   the source text of the token
+      */
+     public function __construct(
+         int $tokenCode,
+         string $tokenName,
+         int $line,
+         string $file,
+         string $content
+     ) {
+         $this->tokenCode = $tokenCode;
+         $this->tokenName = $tokenName;
+         $this->line = $line;
+         $this->content = $content;
+         $this->file = $file;
+     }
+
+     /** Returns the symbolic token name. */
+     public function __toString(): string
+     {
+         return $this->tokenName;
+     }
+
+     /** Returns the CRC32 checksum of the token content. */
+     public function hashCode(): int
+     {
+         return crc32($this->content);
+     }
+
+     /**
+      * Tells whether the given token is equal to this one.
+      *
+      * Tokens with different hash codes are never equal. As CRC32 is only
+      * 32 bits wide, two different contents can share a checksum; when the
+      * other token is a Token as well, equal hashes are therefore confirmed
+      * by comparing the content itself. For other token kinds the hash
+      * comparison alone decides, as before.
+      */
+     public function equals(AbstractToken $token): bool
+     {
+         if ($token->hashCode() !== $this->hashCode()) {
+             return false;
+         }
+
+         return !($token instanceof self) || $token->content === $this->content;
+     }
+ }
diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php
new file mode 100644
index 00000000..73528e9c
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTreeStrategy.php
@@ -0,0 +1,105 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy;
+
+use function array_keys;
+use function file_get_contents;
+use function is_array;
+use function token_get_all;
+use Exception;
+use SebastianBergmann\PHPCPD\CodeClone;
+use SebastianBergmann\PHPCPD\CodeCloneFile;
+use SebastianBergmann\PHPCPD\CodeCloneMap;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\AbstractToken;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel;
+use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token;
+
+/**
+ * Clone detection strategy that tokenizes all processed files into one long
+ * "word" and searches that word for clones using a suffix tree.
+ *
+ * For the design of the algorithm, all credits go to the authors of "Do Code Clones Matter?".
+ */
+final class SuffixTreeStrategy extends AbstractStrategy
+{
+    /**
+     * Tokens of all files processed since the last postProcess() call, in
+     * processing order.
+     *
+     * @var AbstractToken[]
+     */
+    private $word = [];
+
+    /**
+     * Clone map handed to the most recent processFile() call; null when no
+     * file has been processed since the last postProcess() call.
+     *
+     * @var ?CodeCloneMap
+     */
+    private $result;
+
+    /**
+     * Tokenizes the given file and appends every relevant token to the word.
+     *
+     * Only array-form tokens are considered (single-character string tokens
+     * are skipped), and tokens whose type is in the ignore list are dropped.
+     * The clone map is remembered so postProcess() can report into it.
+     *
+     * @param string       $file   path of the file to tokenize
+     * @param CodeCloneMap $result clone map that will receive the clones
+     */
+    public function processFile(string $file, CodeCloneMap $result): void
+    {
+        $content = file_get_contents($file);
+
+        foreach (token_get_all($content) as $token) {
+            if (!is_array($token) || isset($this->tokensIgnoreList[$token[0]])) {
+                continue;
+            }
+
+            $this->word[] = new Token(
+                $token[0],
+                token_name($token[0]),
+                $token[2],
+                $file,
+                $token[1]
+            );
+        }
+
+        $this->result = $result;
+    }
+
+    /**
+     * Builds the suffix tree over all collected tokens, searches it for
+     * clones and adds every clone found to the remembered clone map.
+     *
+     * The collected tokens and the remembered clone map are cleared before
+     * the search starts, so the same strategy instance can be used for another
+     * detection run without re-analysing tokens from the previous one.
+     *
+     * @throws Exception when no file has been processed since the last run
+     */
+    public function postProcess(): void
+    {
+        if ($this->result === null) {
+            throw new Exception('Missing result');
+        }
+
+        // Work on local snapshots and reset the instance state right away, so
+        // the strategy is left clean even if the search below throws.
+        $result = $this->result;
+        $word   = $this->word;
+
+        $this->result = null;
+        $this->word   = [];
+
+        // Sentinel = End of word
+        $word[] = new Sentinel();
+
+        $tree       = new ApproximateCloneDetectingSuffixTree($word);
+        $cloneInfos = $tree->findClones(
+            $this->config->getMinTokens(),
+            $this->config->getEditDistance(),
+            $this->config->getHeadEquality()
+        );
+
+        foreach ($cloneInfos as $cloneInfo) {
+            /** @var int[] */
+            $others = $cloneInfo->otherClones->extractFirstList();
+
+            foreach ($others as $otherStart) {
+                $t         = $word[$otherStart];
+                $lastToken = $word[$cloneInfo->position + $cloneInfo->length];
+
+                // If we stumbled upon the Sentinel, rewind.
+                // NOTE(review): the index steps back by 2 from the sentinel
+                // position, not by 1 -- confirm this is intended.
+                if ($lastToken instanceof Sentinel) {
+                    $lastToken = $word[$cloneInfo->position + $cloneInfo->length - 2];
+                }
+
+                $lines = $lastToken->line - $cloneInfo->token->line;
+
+                $result->add(
+                    new CodeClone(
+                        new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line),
+                        new CodeCloneFile($t->file, $t->line),
+                        $lines,
+                        // TODO: Double check this
+                        $otherStart + 1 - $cloneInfo->position
+                    )
+                );
+            }
+        }
+    }
+}
diff --git a/tests/fixture/editdistance1.php b/tests/fixture/editdistance1.php
new file mode 100644
index 00000000..61a13c3a
--- /dev/null
+++ b/tests/fixture/editdistance1.php
@@ -0,0 +1,27 @@
+question_l10ns->rows->row)) {
+ // Edit difference here.
+ if ($bTranslateLinksFields) {
+ $insertdata['question'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['question']);
+ $insertdata['help'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['help']);
+ }
+ $oQuestionL10n = new QuestionL10n();
+ $oQuestionL10n->question = $insertdata['question'];
+ $oQuestionL10n->help = $insertdata['help'];
+ $oQuestionL10n->language = $insertdata['language'];
+ unset($insertdata['question']);
+ unset($insertdata['help']);
+ unset($insertdata['language']);
+}
+
+// For some reason, two exact files will lead to one 0-line clone.
+$a = 10;
diff --git a/tests/fixture/editdistance2.php b/tests/fixture/editdistance2.php
new file mode 100644
index 00000000..14b44676
--- /dev/null
+++ b/tests/fixture/editdistance2.php
@@ -0,0 +1,24 @@
+question_l10ns->rows->row)) {
+ // Edit difference here.
+ if ($options['translinkfields']) {
+ $insertdata['question'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['question']);
+ $insertdata['help'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['help']);
+ }
+ $oQuestionL10n = new QuestionL10n();
+ $oQuestionL10n->question = $insertdata['question'];
+ $oQuestionL10n->help = $insertdata['help'];
+ $oQuestionL10n->language = $insertdata['language'];
+ unset($insertdata['question']);
+ unset($insertdata['help']);
+ unset($insertdata['language']);
+}
+
+foo();
diff --git a/tests/fixture/type3_clone.php b/tests/fixture/type3_clone.php
new file mode 100644
index 00000000..5557e0bd
--- /dev/null
+++ b/tests/fixture/type3_clone.php
@@ -0,0 +1,40 @@
+ $b) {
+ return 'foo';
+ } else {
+ return 'bar';
+ }
+}
+
+function bar()
+{
+ $a = 10;
+ $b = 20;
+ if ($a > $b) {
+ } else {
+ return 'bar';
+ }
+}
+
+function bar()
+{
+ $a = 10;
+ $b = '20';
+ if ($a) {
+ return 'foo';
+ } else {
+ return 'bar';
+ }
+}
diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php
index c7d61813..fefcb0f8 100644
--- a/tests/unit/DetectorTest.php
+++ b/tests/unit/DetectorTest.php
@@ -13,11 +13,18 @@
use function next;
use function sort;
use PHPUnit\Framework\TestCase;
+use SebastianBergmann\PHPCPD\ArgumentsBuilder;
+use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy;
use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy;
+use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration;
/**
+ * @covers \SebastianBergmann\PHPCPD\Arguments
+ * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder
* @covers \SebastianBergmann\PHPCPD\Detector\Detector
+ * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy
* @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy
+ * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration
*
* @uses \SebastianBergmann\PHPCPD\CodeClone
* @uses \SebastianBergmann\PHPCPD\CodeCloneFile
@@ -28,11 +35,11 @@ final class DetectorTest extends TestCase
/**
* @dataProvider strategyProvider
*
- * @psalm-param class-string $strategy
+ * @psalm-param AbstractStrategy $strategy
*/
- public function testDetectingSimpleClonesWorks(string $strategy): void
+ public function testDetectingSimpleClonesWorks(AbstractStrategy $strategy): void
{
- $clones = (new Detector(new $strategy))->copyPasteDetection(
+ $clones = (new Detector($strategy))->copyPasteDetection(
[__DIR__ . '/../fixture/Math.php']
);
@@ -117,18 +124,19 @@ public function testDetectingSimpleClonesWorks(string $strategy): void
/**
* @dataProvider strategyProvider
- *
- * @psalm-param class-string $strategy
*/
- public function testDetectingExactDuplicateFilesWorks(string $strategy): void
+ public function testDetectingExactDuplicateFilesWorks(AbstractStrategy $strategy): void
{
- $clones = (new Detector(new $strategy))->copyPasteDetection(
+ $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '50'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+
+ $clones = (new Detector($strategy))->copyPasteDetection(
[
__DIR__ . '/../fixture/a.php',
__DIR__ . '/../fixture/b.php',
- ],
- 20,
- 60
+ ]
);
$clones = $clones->clones();
@@ -149,23 +157,25 @@ public function testDetectingExactDuplicateFilesWorks(string $strategy): void
/**
* @dataProvider strategyProvider
- *
- * @psalm-param class-string $strategy
*/
- public function testDetectingClonesInMoreThanTwoFiles(string $strategy): void
+ public function testDetectingClonesInMoreThanTwoFiles(AbstractStrategy $strategy): void
{
- $clones = (new Detector(new $strategy))->copyPasteDetection(
+ $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '60'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+
+ $clones = (new Detector($strategy))->copyPasteDetection(
[
__DIR__ . '/../fixture/a.php',
__DIR__ . '/../fixture/b.php',
__DIR__ . '/../fixture/c.php',
- ],
- 20,
- 60
+ ]
);
$clones = $clones->clones();
- $files = $clones[0]->files();
+ //var_dump($clones);
+ $files = $clones[0]->files();
sort($files);
$file = current($files);
@@ -187,18 +197,18 @@ public function testDetectingClonesInMoreThanTwoFiles(string $strategy): void
/**
* @dataProvider strategyProvider
- *
- * @psalm-param class-string $strategy
*/
- public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(string $strategy): void
+ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(AbstractStrategy $strategy): void
{
- $clones = (new Detector(new $strategy))->copyPasteDetection(
+ $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '61'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+ $clones = (new Detector($strategy))->copyPasteDetection(
[
__DIR__ . '/../fixture/a.php',
__DIR__ . '/../fixture/b.php',
- ],
- 20,
- 61
+ ]
);
$this->assertCount(0, $clones->clones());
@@ -206,18 +216,18 @@ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(string $st
/**
* @dataProvider strategyProvider
- *
- * @psalm-param class-string $strategy
*/
- public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(string $strategy): void
+ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(AbstractStrategy $strategy): void
{
- $clones = (new Detector(new $strategy))->copyPasteDetection(
+ $argv = [1 => '.', '--min-lines', '21', '--min-tokens', '60'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+ $clones = (new Detector($strategy))->copyPasteDetection(
[
__DIR__ . '/../fixture/a.php',
__DIR__ . '/../fixture/b.php',
- ],
- 21,
- 60
+ ]
);
$this->assertCount(0, $clones->clones());
@@ -225,19 +235,18 @@ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(string $stra
/**
* @dataProvider strategyProvider
- *
- * @psalm-param class-string $strategy
*/
- public function testFuzzyClonesAreFound(string $strategy): void
+ public function testFuzzyClonesAreFound(AbstractStrategy $strategy): void
{
- $clones = (new Detector(new $strategy))->copyPasteDetection(
+ $argv = [1 => '.', '--min-lines', '5', '--min-tokens', '20', '--fuzzy', 'true'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+ $clones = (new Detector($strategy))->copyPasteDetection(
[
__DIR__ . '/../fixture/a.php',
__DIR__ . '/../fixture/d.php',
- ],
- 5,
- 20,
- true
+ ]
);
$this->assertCount(1, $clones->clones());
@@ -245,25 +254,30 @@ public function testFuzzyClonesAreFound(string $strategy): void
/**
* @dataProvider strategyProvider
- *
- * @psalm-param class-string $strategy
*/
- public function testStripComments(string $strategy): void
+ public function testStripComments(AbstractStrategy $strategy): void
{
- $detector = new Detector(new $strategy);
+ $argv = [1 => '.', '--min-lines', '8', '--min-tokens', '10', '--fuzzy', 'true'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+
+ $detector = new Detector($strategy);
$clones = $detector->copyPasteDetection(
[
__DIR__ . '/../fixture/e.php',
__DIR__ . '/../fixture/f.php',
- ],
- 8,
- 10,
- true
+ ]
);
$this->assertCount(0, $clones->clones());
+ $argv = [1 => '.', '--min-lines', '7', '--min-tokens', '10', '--fuzzy', 'true'];
+ $arguments = (new ArgumentsBuilder)->build($argv);
+ $config = new StrategyConfiguration($arguments);
+ $strategy->setConfig($config);
+
$clones = $detector->copyPasteDetection(
[
__DIR__ . '/../fixture/e.php',
@@ -278,12 +292,17 @@ public function testStripComments(string $strategy): void
}
/**
- * @psalm-return list$i is within the bounds. Throws an
+ * exception otherwise.
+ */
+    private function checkWithinBounds(int $i): void
+    {
+        // Valid indices run from 0 up to, but not including, the current size.
+        if ($i >= 0 && $i < $this->size) {
+            return;
+        }
+
+        throw new Exception('Out of bounds: ' . $i);
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php
new file mode 100644
index 00000000..ad241485
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/Sentinel.php
@@ -0,0 +1,48 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+/**
+ * A sentinel character which can be used to produce explicit leaves for all
+ * suffixes. The sentinel just has to be appended to the list before handing
+ * it to the suffix tree. For the sentinel, equality and object identity are
+ * the same!
+ */
+class Sentinel extends AbstractToken
+{
+ /** @var int The hash value used. */
+ private $hash;
+
+ public function __construct()
+ {
+ $this->hash = rand(0, PHP_INT_MAX);
+ $this->tokenCode = -1;
+ $this->line = -1;
+ $this->file = '
+ *
+ * nextCharacter exists). Additionally, the state is made
+ * explicit if it is not already and this is not the end-point. It returns
+ * true if the end-point was reached. The newly created (or reached)
+ * explicit node is returned in the "global" variable.
+ */
+    private function testAndSplit(int $refWordEnd, AbstractToken $nextCharacter): bool
+    {
+        // The trap state is always an end state.
+        if ($this->currentNode < 0) {
+            return true;
+        }
+
+        // Empty reference word: the state is the current node itself, so it
+        // is an end-point exactly when a transition for the character exists.
+        if ($refWordEnd <= $this->refWordBegin) {
+            if ($this->nextNode->get($this->currentNode, $nextCharacter) >= 0) {
+                return true;
+            }
+
+            $this->explicitNode = $this->currentNode;
+
+            return false;
+        }
+
+        $firstCharacter = $this->word[$this->refWordBegin];
+        $next           = $this->nextNode->get($this->currentNode, $firstCharacter);
+
+        // Position inside the edge leading to $next at which the reference
+        // word ends.
+        $edgeBegin  = $this->nodeWordBegin[$next];
+        $splitPoint = $edgeBegin + ($refWordEnd - $this->refWordBegin);
+
+        if ($nextCharacter->equals($this->word[$splitPoint])) {
+            return true;
+        }
+
+        // Not an end-point and not explicit, so make it explicit: a new node
+        // takes over the first part of the edge ...
+        $this->explicitNode                       = $this->numNodes++;
+        $this->nodeWordBegin[$this->explicitNode] = $edgeBegin;
+        $this->nodeWordEnd[$this->explicitNode]   = $splitPoint;
+        $this->nextNode->put($this->currentNode, $firstCharacter, $this->explicitNode);
+
+        // ... and the old target node keeps the remainder, now hanging below
+        // the new node.
+        $this->nodeWordBegin[$next] = $splitPoint;
+        $this->nextNode->put($this->explicitNode, $this->word[$splitPoint], $next);
+
+        return false;
+    }
+
+    /**
+     * Canonizes the reference pair (currentNode, (refWordBegin, refWordEnd))
+     * as defined in Ukkonen's paper. The node and the begin index are read
+     * from and written back to the "global" variables {@link #currentNode}
+     * and {@link #refWordBegin}; only the end index is passed as a parameter.
+     *
+     * @param int $refWordEnd one after the end index for the word of the reference pair
+     */
+    private function canonize(int $refWordEnd): void
+    {
+        // Explicitly handle the trap state: continue from node 0 and skip
+        // one character of the reference word.
+        if ($this->currentNode === -1) {
+            $this->currentNode = 0;
+            $this->refWordBegin++;
+        }
+
+        // An empty word is already canonical.
+        if ($refWordEnd <= $this->refWordBegin) {
+            return;
+        }
+
+        $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]);
+
+        while (true) {
+            $edgeLength = $this->nodeWordEnd[$next] - $this->nodeWordBegin[$next];
+
+            // The remaining reference word ends inside this edge: done.
+            if ($edgeLength > $refWordEnd - $this->refWordBegin) {
+                return;
+            }
+
+            // Consume the whole edge and move on to its target node.
+            $this->refWordBegin += $edgeLength;
+            $this->currentNode = $next;
+
+            // Nothing left of the reference word: done.
+            if ($refWordEnd <= $this->refWordBegin) {
+                return;
+            }
+
+            $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]);
+        }
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php
new file mode 100644
index 00000000..4bf807ee
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php
@@ -0,0 +1,232 @@
+
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree;
+
+/**
+ * The hash table used for the {@link SuffixTree} class. It is specifically
+ * written and optimized for its implementation and is thus probably of little
+ * use for any other application.
+ *