# code_indexer.py
"""
Code Indexer for Repository Analysis
Analyzes code repositories to build comprehensive indexes for each subdirectory,
identifying file relationships and reusable components for implementation.
Features:
- Recursive file traversal
- LLM-powered code similarity analysis using augmented LLM classes
- JSON-based relationship storage
- Configurable matching strategies
- Progress tracking and error handling
- Automatic LLM provider selection based on API key availability
"""
import asyncio
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional
# MCP Agent imports for LLM
from utils.llm_utils import get_preferred_llm_class, get_default_models
@dataclass
class FileRelationship:
"""Represents a relationship between a repo file and target structure file"""
repo_file_path: str
target_file_path: str
relationship_type: str # 'direct_match', 'partial_match', 'reference', 'utility'
confidence_score: float # 0.0 to 1.0
helpful_aspects: List[str]
potential_contributions: List[str]
usage_suggestions: str
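    # Illustrative serialized record (hypothetical values), e.g. via dataclasses.asdict():
    #   {"repo_file_path": "src/gcn.py", "target_file_path": "src/core/model.py",
    #    "relationship_type": "partial_match", "confidence_score": 0.8,
    #    "helpful_aspects": ["message passing"], "potential_contributions": ["layer implementation"],
    #    "usage_suggestions": "Adapt the propagation step for the target model."}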
@dataclass
class FileSummary:
"""Summary information for a repository file"""
file_path: str
file_type: str
main_functions: List[str]
key_concepts: List[str]
dependencies: List[str]
summary: str
lines_of_code: int
last_modified: str
@dataclass
class RepoIndex:
"""Complete index for a repository"""
repo_name: str
total_files: int
file_summaries: List[FileSummary]
relationships: List[FileRelationship]
analysis_metadata: Dict[str, Any]
class CodeIndexer:
"""Main class for building code repository indexes"""
def __init__(
self,
        code_base_path: Optional[str] = None,
        target_structure: Optional[str] = None,
        output_dir: Optional[str] = None,
        config_path: str = "mcp_agent.secrets.yaml",
        indexer_config_path: Optional[str] = None,
enable_pre_filtering: bool = True,
):
# Load configurations first
self.config_path = config_path
self.indexer_config_path = indexer_config_path
# Derive main config path from secrets path (same directory)
secrets_dir = os.path.dirname(os.path.abspath(config_path))
self.main_config_path = os.path.join(secrets_dir, "mcp_agent.config.yaml")
self.api_config = self._load_api_config()
self.indexer_config = self._load_indexer_config()
self.default_models = get_default_models(self.main_config_path)
# Use config paths if not provided as parameters
paths_config = self.indexer_config.get("paths", {})
self.code_base_path = Path(
code_base_path or paths_config.get("code_base_path", "code_base")
)
self.output_dir = Path(output_dir or paths_config.get("output_dir", "indexes"))
self.target_structure = (
target_structure # This must be provided as it's project-specific
)
self.enable_pre_filtering = enable_pre_filtering
# LLM clients
self.llm_client = None
self.llm_client_type = None
# Initialize logger early
self.logger = self._setup_logger()
# Create output directory if it doesn't exist
self.output_dir.mkdir(parents=True, exist_ok=True)
# Load file analysis configuration
file_analysis_config = self.indexer_config.get("file_analysis", {})
self.supported_extensions = set(
file_analysis_config.get(
"supported_extensions",
[
".py",
".js",
".ts",
".java",
".cpp",
".c",
".h",
".hpp",
".cs",
".php",
".rb",
".go",
".rs",
".scala",
".kt",
".swift",
".m",
".mm",
".r",
".matlab",
".sql",
".sh",
".bat",
".ps1",
".yaml",
".yml",
".json",
".xml",
".toml",
],
)
)
self.skip_directories = set(
file_analysis_config.get(
"skip_directories",
[
"__pycache__",
"node_modules",
"target",
"build",
"dist",
"venv",
"env",
],
)
)
self.max_file_size = file_analysis_config.get("max_file_size", 1048576) # 1MB
self.max_content_length = file_analysis_config.get("max_content_length", 3000)
# Load LLM configuration
llm_config = self.indexer_config.get("llm", {})
self.model_provider = llm_config.get("model_provider", "anthropic")
self.llm_max_tokens = llm_config.get("max_tokens", 4000)
self.llm_temperature = llm_config.get("temperature", 0.3)
self.llm_system_prompt = llm_config.get(
"system_prompt",
"You are a code analysis expert. Provide precise, structured analysis of code relationships and similarities.",
)
self.request_delay = llm_config.get("request_delay", 0.1)
self.max_retries = llm_config.get("max_retries", 3)
self.retry_delay = llm_config.get("retry_delay", 1.0)
# Load relationship configuration
relationship_config = self.indexer_config.get("relationships", {})
self.min_confidence_score = relationship_config.get("min_confidence_score", 0.3)
self.high_confidence_threshold = relationship_config.get(
"high_confidence_threshold", 0.7
)
self.relationship_types = relationship_config.get(
"relationship_types",
{
"direct_match": 1.0,
"partial_match": 0.8,
"reference": 0.6,
"utility": 0.4,
},
)
# Load performance configuration
performance_config = self.indexer_config.get("performance", {})
self.enable_concurrent_analysis = performance_config.get(
"enable_concurrent_analysis", False
)
self.max_concurrent_files = performance_config.get("max_concurrent_files", 5)
self.enable_content_caching = performance_config.get(
"enable_content_caching", False
)
self.max_cache_size = performance_config.get("max_cache_size", 100)
# Load debug configuration
debug_config = self.indexer_config.get("debug", {})
self.save_raw_responses = debug_config.get("save_raw_responses", False)
self.raw_responses_dir = debug_config.get(
"raw_responses_dir", "debug_responses"
)
self.verbose_output = debug_config.get("verbose_output", False)
self.mock_llm_responses = debug_config.get("mock_llm_responses", False)
# Load output configuration
output_config = self.indexer_config.get("output", {})
self.generate_summary = output_config.get("generate_summary", True)
self.generate_statistics = output_config.get("generate_statistics", True)
self.include_metadata = output_config.get("include_metadata", True)
self.index_filename_pattern = output_config.get(
"index_filename_pattern", "{repo_name}_index.json"
)
self.summary_filename = output_config.get(
"summary_filename", "indexing_summary.json"
)
self.stats_filename = output_config.get(
"stats_filename", "indexing_statistics.json"
)
# Initialize caching if enabled
self.content_cache = {} if self.enable_content_caching else None
# Create debug directory if needed
if self.save_raw_responses:
Path(self.raw_responses_dir).mkdir(parents=True, exist_ok=True)
# Debug logging
if self.verbose_output:
self.logger.info(
f"Initialized CodeIndexer with config: {self.indexer_config_path}"
)
self.logger.info(f"Code base path: {self.code_base_path}")
self.logger.info(f"Output directory: {self.output_dir}")
self.logger.info(f"Model provider: {self.model_provider}")
self.logger.info(f"Concurrent analysis: {self.enable_concurrent_analysis}")
self.logger.info(f"Content caching: {self.enable_content_caching}")
self.logger.info(f"Mock LLM responses: {self.mock_llm_responses}")
def _setup_logger(self) -> logging.Logger:
"""Setup logging configuration from config file"""
logger = logging.getLogger("CodeIndexer")
# Get logging config
logging_config = self.indexer_config.get("logging", {})
log_level = logging_config.get("level", "INFO")
log_format = logging_config.get(
"log_format", "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))
# Clear existing handlers
logger.handlers.clear()
# Console handler
handler = logging.StreamHandler()
formatter = logging.Formatter(log_format)
handler.setFormatter(formatter)
logger.addHandler(handler)
# File handler if enabled
if logging_config.get("log_to_file", False):
log_file = logging_config.get("log_file", "indexer.log")
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
return logger
def _load_api_config(self) -> Dict[str, Any]:
"""Load API configuration from YAML file"""
try:
import yaml
with open(self.config_path, "r", encoding="utf-8") as f:
return yaml.safe_load(f)
except Exception as e:
            # self.logger doesn't exist yet at this point, so fall back to print
            print(f"Warning: Failed to load API config from {self.config_path}: {e}")
return {}
def _load_indexer_config(self) -> Dict[str, Any]:
"""Load indexer configuration from YAML file"""
try:
import yaml
with open(self.indexer_config_path, "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
if config is None:
config = {}
return config
except Exception as e:
print(
f"Warning: Failed to load indexer config from {self.indexer_config_path}: {e}"
)
print("Using default configuration values")
return {}
async def _initialize_llm_client(self):
"""Initialize LLM client (Anthropic or OpenAI) based on API key availability"""
if self.llm_client is not None:
return self.llm_client, self.llm_client_type
# Check if mock responses are enabled
if self.mock_llm_responses:
self.logger.info("Using mock LLM responses for testing")
self.llm_client = "mock"
self.llm_client_type = "mock"
return "mock", "mock"
        # Check which API keys are configured; try Anthropic first, then OpenAI
anthropic_key = self.api_config.get("anthropic", {}).get("api_key", "")
openai_key = self.api_config.get("openai", {}).get("api_key", "")
# Try Anthropic API first if key is available
if anthropic_key and anthropic_key.strip():
try:
from anthropic import AsyncAnthropic
client = AsyncAnthropic(api_key=anthropic_key)
# Test connection with default model from config
await client.messages.create(
model=self.default_models["anthropic"],
max_tokens=10,
messages=[{"role": "user", "content": "test"}],
)
self.logger.info(
f"Using Anthropic API with model: {self.default_models['anthropic']}"
)
self.llm_client = client
self.llm_client_type = "anthropic"
return client, "anthropic"
except Exception as e:
self.logger.warning(f"Anthropic API unavailable: {e}")
# Try OpenAI API if Anthropic failed or key not available
if openai_key and openai_key.strip():
try:
from openai import AsyncOpenAI
# Handle custom base_url if specified
openai_config = self.api_config.get("openai", {})
base_url = openai_config.get("base_url")
if base_url:
client = AsyncOpenAI(api_key=openai_key, base_url=base_url)
else:
client = AsyncOpenAI(api_key=openai_key)
# Test connection with default model from config
await client.chat.completions.create(
model=self.default_models["openai"],
max_tokens=10,
messages=[{"role": "user", "content": "test"}],
)
self.logger.info(
f"Using OpenAI API with model: {self.default_models['openai']}"
)
if base_url:
self.logger.info(f"Using custom base URL: {base_url}")
self.llm_client = client
self.llm_client_type = "openai"
return client, "openai"
except Exception as e:
self.logger.warning(f"OpenAI API unavailable: {e}")
raise ValueError(
"No available LLM API - please check your API keys in configuration"
)
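    # Illustrative shape of the secrets file read by _load_api_config (key values
    # are placeholders; base_url is optional, for OpenAI-compatible endpoints):
    #
    #   anthropic:
    #     api_key: "sk-ant-..."
    #   openai:
    #     api_key: "sk-..."
    #     base_url: "https://example-proxy.local/v1"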
async def _call_llm(
self, prompt: str, system_prompt: str = None, max_tokens: int = None
) -> str:
"""Call LLM for code analysis with retry mechanism and debugging support"""
if system_prompt is None:
system_prompt = self.llm_system_prompt
if max_tokens is None:
max_tokens = self.llm_max_tokens
# Mock response for testing
if self.mock_llm_responses:
mock_response = self._generate_mock_response(prompt)
if self.save_raw_responses:
self._save_debug_response("mock", prompt, mock_response)
return mock_response
last_error = None
# Retry mechanism
for attempt in range(self.max_retries):
try:
if self.verbose_output and attempt > 0:
self.logger.info(
f"LLM call attempt {attempt + 1}/{self.max_retries}"
)
client, client_type = await self._initialize_llm_client()
if client_type == "anthropic":
response = await client.messages.create(
model=self.default_models["anthropic"],
system=system_prompt,
messages=[{"role": "user", "content": prompt}],
max_tokens=max_tokens,
temperature=self.llm_temperature,
)
content = ""
for block in response.content:
if block.type == "text":
content += block.text
# Save debug response if enabled
if self.save_raw_responses:
self._save_debug_response("anthropic", prompt, content)
return content
elif client_type == "openai":
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
]
response = await client.chat.completions.create(
model=self.default_models["openai"],
messages=messages,
max_tokens=max_tokens,
temperature=self.llm_temperature,
)
content = response.choices[0].message.content or ""
# Save debug response if enabled
if self.save_raw_responses:
self._save_debug_response("openai", prompt, content)
return content
else:
raise ValueError(f"Unsupported client type: {client_type}")
except Exception as e:
last_error = e
self.logger.warning(f"LLM call attempt {attempt + 1} failed: {e}")
if attempt < self.max_retries - 1:
await asyncio.sleep(
                        self.retry_delay * (attempt + 1)
                    )  # Linear backoff: the delay grows with each attempt
# All retries failed
error_msg = f"LLM call failed after {self.max_retries} attempts. Last error: {str(last_error)}"
self.logger.error(error_msg)
return f"Error in LLM analysis: {error_msg}"
def _generate_mock_response(self, prompt: str) -> str:
"""Generate mock LLM response for testing"""
if "JSON format" in prompt and "file_type" in prompt:
# File analysis mock
return """
{
"file_type": "Python module",
"main_functions": ["main_function", "helper_function"],
"key_concepts": ["data_processing", "algorithm"],
"dependencies": ["numpy", "pandas"],
"summary": "Mock analysis of code file functionality."
}
"""
elif "relationships" in prompt:
# Relationship analysis mock
return """
{
"relationships": [
{
"target_file_path": "src/core/mock.py",
"relationship_type": "partial_match",
"confidence_score": 0.8,
"helpful_aspects": ["algorithm implementation", "data structures"],
"potential_contributions": ["core functionality", "utility methods"],
"usage_suggestions": "Mock relationship suggestion for testing."
}
]
}
"""
elif "relevant_files" in prompt:
# File filtering mock
return """
{
"relevant_files": [
{
"file_path": "mock_file.py",
"relevance_reason": "Mock relevance reason",
"confidence": 0.9,
"expected_contribution": "Mock contribution"
}
],
"summary": {
"total_files_analyzed": "10",
"relevant_files_count": "1",
"filtering_strategy": "Mock filtering strategy"
}
}
"""
else:
return "Mock LLM response for testing purposes."
def _save_debug_response(self, provider: str, prompt: str, response: str):
"""Save LLM response for debugging"""
try:
import hashlib
from datetime import datetime
# Create a hash of the prompt for filename
prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:8]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{provider}_{timestamp}_{prompt_hash}.json"
debug_data = {
"timestamp": datetime.now().isoformat(),
"provider": provider,
"prompt": prompt[:500] + "..." if len(prompt) > 500 else prompt,
"response": response,
"full_prompt_length": len(prompt),
}
debug_file = Path(self.raw_responses_dir) / filename
with open(debug_file, "w", encoding="utf-8") as f:
json.dump(debug_data, f, indent=2, ensure_ascii=False)
except Exception as e:
self.logger.warning(f"Failed to save debug response: {e}")
def get_all_repo_files(self, repo_path: Path) -> List[Path]:
"""Recursively get all supported files in a repository"""
files = []
try:
for root, dirs, filenames in os.walk(repo_path):
# Skip common non-code directories
dirs[:] = [
d
for d in dirs
if not d.startswith(".") and d not in self.skip_directories
]
for filename in filenames:
file_path = Path(root) / filename
if file_path.suffix.lower() in self.supported_extensions:
files.append(file_path)
except Exception as e:
self.logger.error(f"Error traversing {repo_path}: {e}")
return files
def generate_file_tree(self, repo_path: Path, max_depth: int = 5) -> str:
"""Generate file tree structure string for the repository"""
tree_lines = []
def add_to_tree(current_path: Path, prefix: str = "", depth: int = 0):
if depth > max_depth:
return
try:
items = sorted(
current_path.iterdir(), key=lambda x: (x.is_file(), x.name.lower())
)
# Filter out irrelevant directories and files
items = [
item
for item in items
if not item.name.startswith(".")
and item.name not in self.skip_directories
]
for i, item in enumerate(items):
is_last = i == len(items) - 1
current_prefix = "└── " if is_last else "├── "
tree_lines.append(f"{prefix}{current_prefix}{item.name}")
if item.is_dir():
extension_prefix = " " if is_last else "│ "
add_to_tree(item, prefix + extension_prefix, depth + 1)
elif item.suffix.lower() in self.supported_extensions:
# Add file size information
try:
size = item.stat().st_size
if size > 1024:
size_str = f" ({size // 1024}KB)"
else:
size_str = f" ({size}B)"
tree_lines[-1] += size_str
except (OSError, PermissionError):
pass
except PermissionError:
tree_lines.append(f"{prefix}├── [Permission Denied]")
except Exception as e:
tree_lines.append(f"{prefix}├── [Error: {str(e)}]")
tree_lines.append(f"{repo_path.name}/")
add_to_tree(repo_path)
return "\n".join(tree_lines)
async def pre_filter_files(self, repo_path: Path, file_tree: str) -> List[str]:
"""Use LLM to pre-filter relevant files based on target structure"""
filter_prompt = f"""
        You are a code analysis expert. Analyze the following code repository file tree against the target project structure and identify the files that may be relevant to the target project.
Target Project Structure:
{self.target_structure}
Code Repository File Tree:
{file_tree}
Please analyze which files might be helpful for implementing the target project structure, including:
- Core algorithm implementation files (such as GCN, recommendation systems, graph neural networks, etc.)
- Data processing and preprocessing files
- Loss functions and evaluation metric files
- Configuration and utility files
- Test files
- Documentation files
Please return the filtering results in JSON format:
{{
"relevant_files": [
{{
"file_path": "file path relative to repository root",
"relevance_reason": "why this file is relevant",
"confidence": 0.0-1.0,
"expected_contribution": "expected contribution to the target project"
}}
],
"summary": {{
"total_files_analyzed": "total number of files analyzed",
"relevant_files_count": "number of relevant files",
"filtering_strategy": "explanation of filtering strategy"
}}
}}
Only return files with confidence > {self.min_confidence_score}. Focus on files related to recommendation systems, graph neural networks, and diffusion models.
"""
try:
self.logger.info("Starting LLM pre-filtering of files...")
llm_response = await self._call_llm(
filter_prompt,
system_prompt="You are a professional code analysis and project architecture expert, skilled at identifying code file functionality and relevance.",
max_tokens=2000,
)
# Parse JSON response
match = re.search(r"\{.*\}", llm_response, re.DOTALL)
if not match:
self.logger.warning(
"Unable to parse LLM filtering response, will use all files"
)
return []
filter_data = json.loads(match.group(0))
relevant_files = filter_data.get("relevant_files", [])
# Extract file paths
selected_files = []
for file_info in relevant_files:
file_path = file_info.get("file_path", "")
confidence = file_info.get("confidence", 0.0)
# Use configured minimum confidence threshold
if file_path and confidence > self.min_confidence_score:
selected_files.append(file_path)
summary = filter_data.get("summary", {})
self.logger.info(
f"LLM filtering completed: {summary.get('relevant_files_count', len(selected_files))} relevant files selected"
)
self.logger.info(
f"Filtering strategy: {summary.get('filtering_strategy', 'Not provided')}"
)
return selected_files
except Exception as e:
self.logger.error(f"LLM pre-filtering failed: {e}")
self.logger.info("Will fallback to analyzing all files")
return []
def filter_files_by_paths(
self, all_files: List[Path], selected_paths: List[str], repo_path: Path
) -> List[Path]:
"""Filter file list based on LLM-selected paths"""
if not selected_paths:
return all_files
filtered_files = []
for file_path in all_files:
# Get path relative to repository root
relative_path = str(file_path.relative_to(repo_path))
# Check if it's in the selected list
for selected_path in selected_paths:
# Normalize path comparison
if (
relative_path == selected_path
or relative_path.replace("\\", "/")
== selected_path.replace("\\", "/")
or selected_path in relative_path
or relative_path in selected_path
):
filtered_files.append(file_path)
break
return filtered_files
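    # Note: the matching above is deliberately fuzzy; exact matches, separator-normalized
    # matches, and substring containment all count. For example (hypothetical paths),
    # a selected "models/gcn.py" also matches "src/models/gcn.py".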
def _get_cache_key(self, file_path: Path) -> str:
"""Generate cache key for file content"""
try:
stats = file_path.stat()
return f"{file_path}:{stats.st_mtime}:{stats.st_size}"
except (OSError, PermissionError):
return str(file_path)
def _manage_cache_size(self):
"""Manage cache size to stay within limits"""
if not self.enable_content_caching or not self.content_cache:
return
if len(self.content_cache) > self.max_cache_size:
            # Remove oldest entries (simple FIFO; dicts preserve insertion order in Python 3.7+)
excess_count = len(self.content_cache) - self.max_cache_size + 10
keys_to_remove = list(self.content_cache.keys())[:excess_count]
for key in keys_to_remove:
del self.content_cache[key]
if self.verbose_output:
self.logger.info(
f"Cache cleaned: removed {excess_count} entries, {len(self.content_cache)} entries remaining"
)
async def analyze_file_content(self, file_path: Path) -> FileSummary:
"""Analyze a single file and create summary with caching support"""
try:
# Check file size before reading
file_size = file_path.stat().st_size
if file_size > self.max_file_size:
self.logger.warning(
f"Skipping file {file_path} - size {file_size} bytes exceeds limit {self.max_file_size}"
)
return FileSummary(
file_path=str(file_path.relative_to(self.code_base_path)),
file_type="skipped - too large",
main_functions=[],
key_concepts=[],
dependencies=[],
summary=f"File skipped - size {file_size} bytes exceeds {self.max_file_size} byte limit",
lines_of_code=0,
last_modified=datetime.fromtimestamp(
file_path.stat().st_mtime
).isoformat(),
)
# Check cache if enabled
cache_key = None
if self.enable_content_caching:
cache_key = self._get_cache_key(file_path)
if cache_key in self.content_cache:
if self.verbose_output:
self.logger.info(f"Using cached analysis for {file_path.name}")
return self.content_cache[cache_key]
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
content = f.read()
# Get file stats
stats = file_path.stat()
lines_of_code = len([line for line in content.split("\n") if line.strip()])
# Truncate content based on config
content_for_analysis = content[: self.max_content_length]
content_suffix = "..." if len(content) > self.max_content_length else ""
# Create analysis prompt
analysis_prompt = f"""
Analyze this code file and provide a structured summary:
File: {file_path.name}
Content:
```
{content_for_analysis}{content_suffix}
```
Please provide analysis in this JSON format:
{{
"file_type": "description of what type of file this is",
"main_functions": ["list", "of", "main", "functions", "or", "classes"],
"key_concepts": ["important", "concepts", "algorithms", "patterns"],
"dependencies": ["external", "libraries", "or", "imports"],
"summary": "2-3 sentence summary of what this file does"
}}
Focus on the core functionality and potential reusability.
"""
# Get LLM analysis with configured parameters
llm_response = await self._call_llm(analysis_prompt, max_tokens=1000)
            try:
                # Try to parse the JSON response (match is None when no JSON object is found)
                match = re.search(r"\{.*\}", llm_response, re.DOTALL)
                analysis_data = json.loads(match.group(0))
            except (json.JSONDecodeError, AttributeError):
                # Fall back to basic analysis if JSON extraction or parsing fails
analysis_data = {
"file_type": f"{file_path.suffix} file",
"main_functions": [],
"key_concepts": [],
"dependencies": [],
"summary": "File analysis failed - JSON parsing error",
}
file_summary = FileSummary(
file_path=str(file_path.relative_to(self.code_base_path)),
file_type=analysis_data.get("file_type", "unknown"),
main_functions=analysis_data.get("main_functions", []),
key_concepts=analysis_data.get("key_concepts", []),
dependencies=analysis_data.get("dependencies", []),
summary=analysis_data.get("summary", "No summary available"),
lines_of_code=lines_of_code,
last_modified=datetime.fromtimestamp(stats.st_mtime).isoformat(),
)
# Cache the result if caching is enabled
if self.enable_content_caching and cache_key:
self.content_cache[cache_key] = file_summary
self._manage_cache_size()
return file_summary
except Exception as e:
self.logger.error(f"Error analyzing file {file_path}: {e}")
return FileSummary(
file_path=str(file_path.relative_to(self.code_base_path)),
file_type="error",
main_functions=[],
key_concepts=[],
dependencies=[],
summary=f"Analysis failed: {str(e)}",
lines_of_code=0,
last_modified="",
)
async def find_relationships(
self, file_summary: FileSummary
) -> List[FileRelationship]:
"""Find relationships between a repo file and target structure"""
# Build relationship type description from config
relationship_type_desc = []
for rel_type, weight in self.relationship_types.items():
relationship_type_desc.append(f"- {rel_type} (priority: {weight})")
relationship_prompt = f"""
Analyze the relationship between this existing code file and the target project structure.
Existing File Analysis:
- Path: {file_summary.file_path}
- Type: {file_summary.file_type}
- Functions: {', '.join(file_summary.main_functions)}
- Concepts: {', '.join(file_summary.key_concepts)}
- Summary: {file_summary.summary}
Target Project Structure:
{self.target_structure}
Available relationship types (with priority weights):
{chr(10).join(relationship_type_desc)}
Identify potential relationships and provide analysis in this JSON format:
{{
"relationships": [
{{
"target_file_path": "path/in/target/structure",
"relationship_type": "direct_match|partial_match|reference|utility",
"confidence_score": 0.0-1.0,
"helpful_aspects": ["specific", "aspects", "that", "could", "help"],
"potential_contributions": ["how", "this", "could", "contribute"],
"usage_suggestions": "detailed suggestion on how to use this file"
}}
]
}}
Consider the priority weights when determining relationship types. Higher weight types should be preferred when multiple types apply.
Only include relationships with confidence > {self.min_confidence_score}. Focus on concrete, actionable connections.
"""
try:
llm_response = await self._call_llm(relationship_prompt, max_tokens=1500)
            match = re.search(r"\{.*\}", llm_response, re.DOTALL)
            if match is None:
                raise ValueError("No JSON object found in LLM response")
            relationship_data = json.loads(match.group(0))
relationships = []
for rel_data in relationship_data.get("relationships", []):
confidence_score = float(rel_data.get("confidence_score", 0.0))
relationship_type = rel_data.get("relationship_type", "reference")
# Validate relationship type is in config
if relationship_type not in self.relationship_types:
if self.verbose_output:
self.logger.warning(
f"Unknown relationship type '{relationship_type}', using 'reference'"
)
relationship_type = "reference"
# Apply configured minimum confidence filter
if confidence_score > self.min_confidence_score:
relationship = FileRelationship(
repo_file_path=file_summary.file_path,
target_file_path=rel_data.get("target_file_path", ""),
relationship_type=relationship_type,
confidence_score=confidence_score,
helpful_aspects=rel_data.get("helpful_aspects", []),
potential_contributions=rel_data.get(
"potential_contributions", []
),
usage_suggestions=rel_data.get("usage_suggestions", ""),
)
relationships.append(relationship)
return relationships
except Exception as e:
self.logger.error(
f"Error finding relationships for {file_summary.file_path}: {e}"
)
return []
async def _analyze_single_file_with_relationships(
self, file_path: Path, index: int, total: int
) -> tuple:
"""Analyze a single file and its relationships (for concurrent processing)"""
if self.verbose_output:
self.logger.info(f"Analyzing file {index}/{total}: {file_path.name}")
# Get file summary
file_summary = await self.analyze_file_content(file_path)
# Find relationships
relationships = await self.find_relationships(file_summary)
return file_summary, relationships
async def process_repository(self, repo_path: Path) -> RepoIndex:
"""Process a single repository and create complete index with optional concurrent processing"""
repo_name = repo_path.name
self.logger.info(f"Processing repository: {repo_name}")
# Step 1: Generate file tree
self.logger.info("Generating file tree structure...")
file_tree = self.generate_file_tree(repo_path)
# Step 2: Get all files
all_files = self.get_all_repo_files(repo_path)
self.logger.info(f"Found {len(all_files)} files in {repo_name}")
# Step 3: LLM pre-filtering of relevant files
if self.enable_pre_filtering:
self.logger.info("Using LLM for file pre-filtering...")
selected_file_paths = await self.pre_filter_files(repo_path, file_tree)
else:
self.logger.info("Pre-filtering is disabled, will analyze all files")
selected_file_paths = []
# Step 4: Filter file list based on filtering results
if selected_file_paths:
files_to_analyze = self.filter_files_by_paths(
all_files, selected_file_paths, repo_path
)
self.logger.info(
f"After LLM filtering, will analyze {len(files_to_analyze)} relevant files (from {len(all_files)} total)"
)
else:
files_to_analyze = all_files
self.logger.info("LLM filtering failed, will analyze all files")
# Step 5: Analyze filtered files (concurrent or sequential)
if self.enable_concurrent_analysis and len(files_to_analyze) > 1: