// infio-copilot/src/core/diff/strategies/new-unified/search-strategies.ts

import { compareTwoStrings } from "string-similarity"
import { closest } from "fastest-levenshtein"
import { diff_match_patch } from "diff-match-patch"
import { Change, Hunk } from "./types"

/** Outcome reported by each search strategy in this module. */
export type SearchResult = {
	// Line index in the content array where the match starts; the strategies in this file return -1 for "no match"
	index: number
	// Score attached to the match; "no match" results in this file carry 0
	confidence: number
	// Label of the strategy that produced the result ("exact", "anchor", "similarity", "levenshtein", "none", or a "-overlapping" variant)
	strategy: string
}

// Tuning constants shared by the search helpers in this module.
const LARGE_FILE_THRESHOLD = 1000 // line count above which getAdaptiveThreshold stops returning the base threshold unchanged
const UNIQUE_CONTENT_BOOST = 0.05 // largest bonus validateContextLines adds (reached when every search line counts as unique)
const DEFAULT_OVERLAP_SIZE = 3 // default number of lines shared between consecutive windows
const MAX_WINDOW_SIZE = 500 // cap applied to the doubled search size when sizing a window; a longer search string still gets a window of its own length

/**
 * Picks the confidence threshold to use for content of a given size.
 *
 * Content of up to LARGE_FILE_THRESHOLD lines uses `baseThreshold` unchanged.
 * Larger content uses `baseThreshold - 0.07`, floored at 0.8. Note that the
 * floor also applies when `baseThreshold` itself is below 0.8, in which case
 * large content ends up with 0.8.
 *
 * @param contentLength - Number of lines in the content being searched.
 * @param baseThreshold - Threshold requested by the caller.
 * @returns The threshold that similarity scores should be compared against.
 */
function getAdaptiveThreshold(contentLength: number, baseThreshold: number): number {
	const relaxedThreshold = Math.max(baseThreshold - 0.07, 0.8)
	return contentLength <= LARGE_FILE_THRESHOLD ? baseThreshold : relaxedThreshold
}

/**
 * Measures what fraction of the distinct lines of `searchStr` are rare in `content`.
 *
 * The content lines are joined with "\n" and every distinct search line is
 * looked up as a literal (regex-escaped) pattern in that joined text. A line
 * counts as rare when it is found at least once and at most twice. Occurrences
 * are counted as substrings of the joined text, not as whole-line matches.
 *
 * @param searchStr - Newline-separated text whose lines are evaluated.
 * @param content - Lines of the text to look the search lines up in.
 * @returns Number of rare distinct lines divided by the number of distinct lines.
 */
function evaluateContentUniqueness(searchStr: string, content: string[]): number {
	const haystack = content.join("\n")
	const distinctLines = new Set(searchStr.split("\n"))

	const rareLines = Array.from(distinctLines).filter((candidate) => {
		// Escape regex metacharacters so the line is matched literally
		const literalPattern = candidate.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
		const hits = haystack.match(new RegExp(literalPattern, "g"))
		return hits !== null && hits.length <= 2
	})

	return rareLines.length / distinctLines.size
}

/**
 * Builds the text to search for from a list of changes.
 *
 * Only changes of type "context" or "remove" contribute; their `originalLine`
 * values are joined with "\n" in their original order.
 *
 * @param changes - Changes to take the search lines from.
 * @returns The newline-joined search string.
 */
export function prepareSearchString(changes: Change[]): string {
	const keptLines = []
	for (const change of changes) {
		if (change.type === "context" || change.type === "remove") {
			keptLines.push(change.originalLine)
		}
	}
	return keptLines.join("\n")
}

/**
 * Scores how alike two texts are by delegating to `compareTwoStrings` from the
 * string-similarity package (per that library's documentation the score lies
 * between 0 and 1, with 1 meaning identical).
 *
 * @param original - First text to compare.
 * @param modified - Second text to compare.
 * @returns The score reported by `compareTwoStrings`.
 */
export function evaluateSimilarity(original: string, modified: string): number {
	return compareTwoStrings(original, modified)
}

/**
 * Scores two texts with the help of diff-match-patch.
 *
 * A diff from `original` to `modified` is computed and semantically cleaned up,
 * turned into patches, and those patches are applied back onto `original`. The
 * patched text is then compared with `modified` via evaluateSimilarity.
 *
 * @param original - Text the diff starts from.
 * @param modified - Text the diff leads to.
 * @returns Similarity between the patched text and `modified`.
 */
export function getDMPSimilarity(original: string, modified: string): number {
	const engine = new diff_match_patch()

	const delta = engine.diff_main(original, modified)
	// Cleans up the diff list in place before patches are built from it
	engine.diff_cleanupSemantic(delta)

	const patchResult = engine.patch_apply(engine.patch_make(original, delta), original)
	const patchedText = patchResult[0]

	return evaluateSimilarity(patchedText, modified)
}

/**
 * Scores an edit result against the text a hunk is expected to produce.
 *
 * The expected text consists of the hunk's "context" and "add" changes, the
 * original text of its "context" and "remove" changes; each line is the
 * change's `content`, prefixed with its `indent` when one is set. When the
 * result is still more than 0.97 similar to the original text and is not a
 * perfect match for the expected text, the score is scaled down by 0.8.
 *
 * @param hunk - Hunk describing the intended edit.
 * @param result - Text produced by applying the edit.
 * @returns Similarity of `result` to the expected text, possibly scaled down.
 */
export function validateEditResult(hunk: Hunk, result: string): number {
	// Renders the context lines together with the lines of one changed kind
	const renderSide = (changedKind: "add" | "remove"): string =>
		hunk.changes
			.filter((entry) => entry.type === "context" || entry.type === changedKind)
			.map((entry) => (entry.indent ? entry.indent + entry.content : entry.content))
			.join("\n")

	const similarity = getDMPSimilarity(renderSide("add"), result)
	const originalSimilarity = getDMPSimilarity(renderSide("remove"), result)

	// Result still looks like the untouched original: keep some confidence, but reduced
	const looksUnchanged = originalSimilarity > 0.97 && similarity !== 1
	return looksUnchanged ? 0.8 * similarity : similarity
}

/**
 * Scores how well the search string agrees with a candidate piece of content.
 *
 * Search lines starting with "-" are dropped before the comparison. The
 * resulting similarity is checked against the adaptive threshold for the
 * content's line count; below the threshold it is scaled by 0.3. In both cases
 * a bonus of up to UNIQUE_CONTENT_BOOST is added, proportional to the
 * uniqueness score of the (unfiltered) search string within the content.
 *
 * @param searchStr - Newline-separated text being searched for.
 * @param content - Candidate text to validate against.
 * @param confidenceThreshold - Base threshold before size adaptation.
 * @returns The adjusted confidence; it can exceed 1 by the uniqueness bonus.
 */
function validateContextLines(searchStr: string, content: string, confidenceThreshold: number): number {
	const contentLines = content.split("\n")

	// Lines with a leading "-" are treated as removed lines and left out
	const keptSearchLines = searchStr.split("\n").filter((line) => !line.startsWith("-"))
	const similarity = evaluateSimilarity(keptSearchLines.join("\n"), content)

	const threshold = getAdaptiveThreshold(contentLines.length, confidenceThreshold)
	const uniquenessBoost = evaluateContentUniqueness(searchStr, contentLines) * UNIQUE_CONTENT_BOOST

	if (similarity < threshold) {
		return similarity * 0.3 + uniquenessBoost
	}
	return similarity + uniquenessBoost
}

/**
 * Splits `content` into overlapping windows of lines for windowed searching.
 *
 * Each window spans twice the search size, capped at MAX_WINDOW_SIZE but never
 * shorter than the search size itself. Consecutive windows share at least
 * `searchSize - 1` lines (more if `overlapSize` asks for it), so that every
 * run of `searchSize` consecutive lines is fully contained in at least one
 * window. With a smaller overlap, a block straddling a window boundary would
 * be in no window at all and could never be found.
 *
 * @param content - Lines to split into windows.
 * @param searchSize - Number of lines in the text being searched for.
 * @param overlapSize - Minimum number of lines shared by consecutive windows.
 * @returns The windows with the index of their first line in `content`;
 *          trailing windows shorter than `searchSize` are omitted.
 */
function createOverlappingWindows(
	content: string[],
	searchSize: number,
	overlapSize: number = DEFAULT_OVERLAP_SIZE,
): { window: string[]; startIndex: number }[] {
	const windows: { window: string[]; startIndex: number }[] = []

	// Ensure minimum window size is at least searchSize
	const effectiveWindowSize = Math.max(searchSize, Math.min(searchSize * 2, MAX_WINDOW_SIZE))

	// A block of searchSize lines that starts near the end of one window is only fully
	// covered by the next window when the two share at least searchSize - 1 lines
	const requiredOverlap = Math.max(overlapSize, searchSize - 1)

	// Ensure overlap size doesn't exceed window size
	const effectiveOverlapSize = Math.min(requiredOverlap, effectiveWindowSize - 1)

	// Calculate step size, ensure it's at least 1
	const stepSize = Math.max(1, effectiveWindowSize - effectiveOverlapSize)

	for (let i = 0; i < content.length; i += stepSize) {
		const windowContent = content.slice(i, i + effectiveWindowSize)
		// A window shorter than the search text cannot contain it
		if (windowContent.length >= searchSize) {
			windows.push({ window: windowContent, startIndex: i })
		}
	}

	return windows
}

/**
 * Merges matches that were found in neighbouring windows at nearly the same position.
 *
 * Matches are processed from highest to lowest confidence. For each match not
 * yet consumed, matches from directly adjacent windows whose index differs by
 * at most `overlapSize` are folded into it: the confidences are averaged and
 * boosted by 0.05 per folded match (at most 0.1, result capped at 1), and the
 * strategy label gets an "-overlapping" suffix. Matches without such
 * neighbours are passed through unchanged.
 *
 * The input array is not modified; sorting happens on a copy.
 *
 * @param matches - Matches tagged with the index of the window they came from.
 * @param overlapSize - Maximum index distance for two matches to count as the same hit.
 * @returns Combined matches, in the order their leading match was processed.
 */
function combineOverlappingMatches(
	matches: (SearchResult & { windowIndex: number })[],
	overlapSize: number = DEFAULT_OVERLAP_SIZE,
): SearchResult[] {
	if (matches.length === 0) {
		return []
	}

	// Sort a copy by descending confidence so the caller's array keeps its order
	const rankedMatches = [...matches].sort((a, b) => b.confidence - a.confidence)

	const combinedMatches: SearchResult[] = []
	const usedIndices = new Set<number>()

	for (const match of rankedMatches) {
		if (usedIndices.has(match.windowIndex)) {
			continue
		}

		// Find matches from directly adjacent windows that point at (nearly) the same line
		const overlapping = rankedMatches.filter(
			(m) =>
				Math.abs(m.windowIndex - match.windowIndex) === 1 &&
				Math.abs(m.index - match.index) <= overlapSize &&
				!usedIndices.has(m.windowIndex),
		)

		if (overlapping.length > 0) {
			// Boost confidence if we find same match in overlapping windows
			const avgConfidence =
				(match.confidence + overlapping.reduce((sum, m) => sum + m.confidence, 0)) / (overlapping.length + 1)
			const boost = Math.min(0.05 * overlapping.length, 0.1) // Max 10% boost

			combinedMatches.push({
				index: match.index,
				confidence: Math.min(1, avgConfidence + boost),
				strategy: `${match.strategy}-overlapping`,
			})

			usedIndices.add(match.windowIndex)
			overlapping.forEach((m) => usedIndices.add(m.windowIndex))
		} else {
			combinedMatches.push({
				index: match.index,
				confidence: match.confidence,
				strategy: match.strategy,
			})
			usedIndices.add(match.windowIndex)
		}
	}

	return combinedMatches
}

/**
 * Exact-match strategy: looks for `searchStr` verbatim inside overlapping windows of `content`.
 *
 * Every window (built from `content` starting at `startIndex`) is joined with
 * "\n" and searched with `indexOf`; only the first hit per window is used. A
 * hit is scored as the lower of its diff-match-patch similarity and its
 * context validation score, then hits from neighbouring windows are combined.
 *
 * @param searchStr - Newline-separated text to locate.
 * @param content - Lines of the text to search in.
 * @param startIndex - Line index in `content` at which the search begins.
 * @param confidenceThreshold - Base threshold handed to the context validation.
 * @returns The first combined match, or index -1 with confidence 0 when there is none.
 */
export function findExactMatch(
	searchStr: string,
	content: string[],
	startIndex: number = 0,
	confidenceThreshold: number = 0.97,
): SearchResult {
	const searchLineCount = searchStr.split("\n").length
	const windows = createOverlappingWindows(content.slice(startIndex), searchLineCount)
	const hits: (SearchResult & { windowIndex: number })[] = []

	for (const [windowIndex, windowData] of windows.entries()) {
		const windowText = windowData.window.join("\n")
		const hitOffset = windowText.indexOf(searchStr)
		if (hitOffset === -1) {
			continue
		}

		// Counting the line breaks before the hit gives its line offset inside the window
		const lineOffset = windowText.slice(0, hitOffset).split("\n").length - 1
		const matchedContent = windowData.window.slice(lineOffset, lineOffset + searchLineCount).join("\n")

		const similarity = getDMPSimilarity(searchStr, matchedContent)
		const contextSimilarity = validateContextLines(searchStr, matchedContent, confidenceThreshold)

		hits.push({
			// Translate the window-relative line back to an index into `content`
			index: startIndex + windowData.startIndex + lineOffset,
			confidence: Math.min(similarity, contextSimilarity),
			strategy: "exact",
			windowIndex,
		})
	}

	const combinedMatches = combineOverlappingMatches(hits)
	return combinedMatches.length > 0 ? combinedMatches[0] : { index: -1, confidence: 0, strategy: "exact" }
}

/**
 * String-similarity strategy: slides a window of the search string's height over `content`.
 *
 * A window is considered only when its `compareTwoStrings` score beats the
 * best score so far and reaches `confidenceThreshold`. Its adjusted score is
 * that raw score multiplied by the lower of the diff-match-patch similarity
 * and the context validation score; the window with the highest adjusted
 * score wins.
 *
 * @param searchStr - Newline-separated text to locate.
 * @param content - Lines of the text to search in.
 * @param startIndex - Line index in `content` at which the search begins.
 * @param confidenceThreshold - Minimum raw score for a window to be considered.
 * @returns The best window's start index and adjusted score, or index -1 with confidence 0.
 */
export function findSimilarityMatch(
	searchStr: string,
	content: string[],
	startIndex: number = 0,
	confidenceThreshold: number = 0.97,
): SearchResult {
	const windowHeight = searchStr.split("\n").length
	const endExclusive = content.length - windowHeight + 1

	let topScore = 0
	let topIndex = -1

	for (let start = startIndex; start < endExclusive; start++) {
		const candidate = content.slice(start, start + windowHeight).join("\n")
		const rawScore = compareTwoStrings(searchStr, candidate)

		// Skip windows that neither beat the current best nor reach the threshold
		if (!(rawScore > topScore && rawScore >= confidenceThreshold)) {
			continue
		}

		const similarity = getDMPSimilarity(searchStr, candidate)
		const contextSimilarity = validateContextLines(searchStr, candidate, confidenceThreshold)
		const adjustedScore = Math.min(similarity, contextSimilarity) * rawScore

		if (adjustedScore > topScore) {
			topScore = adjustedScore
			topIndex = start
		}
	}

	const confidence = topIndex !== -1 ? topScore : 0
	return { index: topIndex, confidence, strategy: "similarity" }
}

/**
 * Levenshtein strategy: picks the window of `content` closest to `searchStr`.
 *
 * Every run of lines with the search string's height (from `startIndex` on)
 * becomes a candidate; `closest` from fastest-levenshtein selects one of them.
 * The selected candidate is scored as the lower of its diff-match-patch
 * similarity and its context validation score.
 *
 * @param searchStr - Newline-separated text to locate.
 * @param content - Lines of the text to search in.
 * @param startIndex - Line index in `content` at which the search begins.
 * @param confidenceThreshold - Base threshold handed to the context validation.
 * @returns The selected candidate's index and confidence; index -1 when the
 *          confidence is 0 or there are no candidates.
 */
export function findLevenshteinMatch(
	searchStr: string,
	content: string[],
	startIndex: number = 0,
	confidenceThreshold: number = 0.97,
): SearchResult {
	const windowHeight = searchStr.split("\n").length
	const endExclusive = content.length - windowHeight + 1

	const candidates: string[] = []
	for (let start = startIndex; start < endExclusive; start++) {
		candidates.push(content.slice(start, start + windowHeight).join("\n"))
	}

	if (candidates.length === 0) {
		return { index: -1, confidence: 0, strategy: "levenshtein" }
	}

	const nearest = closest(searchStr, candidates)
	// Candidates were collected in order from startIndex, so the position maps straight back
	const index = startIndex + candidates.indexOf(nearest)

	const similarity = getDMPSimilarity(searchStr, nearest)
	const contextSimilarity = validateContextLines(searchStr, nearest, confidenceThreshold)
	const confidence = Math.min(similarity, contextSimilarity)

	return {
		index: confidence === 0 ? -1 : index,
		confidence: index !== -1 ? confidence : 0,
		strategy: "levenshtein",
	}
}

/**
 * Finds the anchor lines of a search string: its first and its last line
 * that contain something other than whitespace.
 *
 * @param searchStr - Newline-separated text to take the anchors from.
 * @returns The two anchor lines exactly as they appear (untrimmed); both are
 *          null when every line is empty or whitespace-only.
 */
function identifyAnchors(searchStr: string): { first: string | null; last: string | null } {
	const hasText = (line: string): boolean => line.trim() !== ""
	const lines = searchStr.split("\n")

	const first = lines.find(hasText) ?? null
	// Scan a reversed copy so the first hit is the last non-blank line
	const last = lines.slice().reverse().find(hasText) ?? null

	return { first, last }
}

/**
 * Anchor strategy: locates the search string through its first and last non-blank lines.
 *
 * The first anchor must occur exactly once in all of `content` and at or after
 * `startIndex`. The last anchor is taken at its final occurrence in `content`
 * and must lie after the first anchor. The lines between the two anchors in
 * `content` are then compared with the lines between the anchors in the search
 * string; the match is accepted only if that similarity reaches the adaptive
 * threshold for the content size.
 *
 * @param searchStr - Newline-separated text to locate.
 * @param content - Lines of the text to search in.
 * @param startIndex - Line index in `content` at which the search begins.
 * @param confidenceThreshold - Base threshold before size adaptation.
 * @returns The first anchor's index with confidence 1, or index -1 with confidence 0.
 */
export function findAnchorMatch(
	searchStr: string,
	content: string[],
	startIndex: number = 0,
	confidenceThreshold: number = 0.97,
): SearchResult {
	const noMatch: SearchResult = { index: -1, confidence: 0, strategy: "anchor" }

	const searchLines = searchStr.split("\n")
	const { first, last } = identifyAnchors(searchStr)

	if (!first || !last) {
		return noMatch
	}

	// The first anchor is only trusted when it is unique in the whole content
	const firstOccurrences = content.filter((contentLine) => contentLine === first).length
	if (firstOccurrences !== 1) {
		return noMatch
	}

	// Unique, so indexOf yields its only position; it must not lie before startIndex
	const firstPosition = content.indexOf(first)
	const firstIndex = firstPosition >= startIndex ? firstPosition : -1

	// The last anchor is taken at its final occurrence, which must not lie before startIndex
	const lastPosition = content.lastIndexOf(last)
	const lastIndex = lastPosition >= startIndex ? lastPosition : -1

	if (firstIndex === -1 || lastIndex === -1 || lastIndex <= firstIndex) {
		return noMatch
	}

	// Validate the context between the anchors. The last anchor is the LAST non-blank
	// search line, so its position is lastIndexOf; indexOf would stop at an earlier
	// line with the same text (e.g. a repeated "}") and truncate the expected context.
	const expectedContext = searchLines.slice(searchLines.indexOf(first) + 1, searchLines.lastIndexOf(last)).join("\n")
	const actualContext = content.slice(firstIndex + 1, lastIndex).join("\n")
	const contextSimilarity = evaluateSimilarity(expectedContext, actualContext)

	if (contextSimilarity < getAdaptiveThreshold(content.length, confidenceThreshold)) {
		return noMatch
	}

	return {
		index: firstIndex,
		confidence: 1,
		strategy: "anchor",
	}
}

/**
 * Runs every search strategy and returns the result with the highest confidence.
 *
 * The strategies run in the order exact, anchor, similarity, Levenshtein. A
 * later result replaces the current best only when its confidence is strictly
 * higher, so on a tie the earlier strategy wins.
 *
 * @param searchStr - Newline-separated text to locate.
 * @param content - Lines of the text to search in.
 * @param startIndex - Line index in `content` at which the search begins.
 * @param confidenceThreshold - Threshold forwarded to every strategy.
 * @returns The most confident result, or index -1, confidence 0 and strategy
 *          "none" when no strategy reports a confidence above 0.
 */
export function findBestMatch(
	searchStr: string,
	content: string[],
	startIndex: number = 0,
	confidenceThreshold: number = 0.97,
): SearchResult {
	const strategies = [findExactMatch, findAnchorMatch, findSimilarityMatch, findLevenshteinMatch]
	const nothingFound: SearchResult = { index: -1, confidence: 0, strategy: "none" }

	return strategies.reduce((best, runStrategy) => {
		const candidate = runStrategy(searchStr, content, startIndex, confidenceThreshold)
		return candidate.confidence > best.confidence ? candidate : best
	}, nothingFound)
}