Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Abstract syntax trees #43

Merged
merged 23 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions .github/workflows/get_function_data_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,41 @@ on:

jobs:
  test:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
        with:
          path: 'functionRetriever' # Specify a path for the main repo

      - name: Clone testRepo repository
        run: |
          mkdir -p ${{ github.workspace }}/inputData # Create inputData directory
          git clone https://github.com/RapidReview-ai/testRepo ${{ github.workspace }}/inputData/testRepo

      - name: Create outputData directory
        run: mkdir -p ${{ github.workspace }}/functionRetriever/outputData

      - name: Set up Python 3.x
        uses: actions/setup-python@v2
        with:
          python-version: '3.11.1'

      - name: Set up Node.js
        uses: actions/setup-node@v2
        with:
          node-version: '18.12.1'

      - name: Install npm dependencies
        run: npm install
        working-directory: ${{ github.workspace }}/functionRetriever

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ${{ github.workspace }}/functionRetriever

      - name: Run tests
        run: python test_get_function_data.py
        working-directory: ${{ github.workspace }}/functionRetriever
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
.env
**.json
function_changes.json
test_function_changes.json
package-lock.json
path
.DS_Store
**/__pycache__
node_modules
temp.js
14 changes: 14 additions & 0 deletions babelParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Parse a JavaScript file with @babel/parser and print its AST as JSON.
// Usage: node babelParser.js <path-to-js-file>
const babel = require('@babel/parser');
const fs = require('fs');

const code = fs.readFileSync(process.argv[2], 'utf8');

try {
    const ast = babel.parse(code, {
        sourceType: "module",
        plugins: [],
    });
    // Stdout carries the AST; the Python caller JSON-parses it.
    console.log(JSON.stringify(ast));
} catch (error) {
    // Report on stderr AND signal failure via the exit status, so callers
    // that check the return code (not just stderr) can detect the error.
    console.error("Parsing error:", error);
    process.exitCode = 1;
}
4 changes: 2 additions & 2 deletions createEmbeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from qdrant_client import QdrantClient
from qdrant_client.models import CollectionDescription, Distance, VectorParams, Record

def embed_sample_functions():
def embed_sample_functions(repo_path):
# Initialize Qdrant Client
client = QdrantClient(host='localhost', port=6333)
# client = QdrantClient(":memory:")
Expand All @@ -17,7 +17,7 @@ def embed_sample_functions():
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the JSON data from the file
json_file_path = 'outputData/function_changes.json' # depends on how you run the file, should be changed to be global and not local path
json_file_path = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json'
with open(json_file_path, 'r') as file:
json_data = json.load(file)

Expand Down
244 changes: 179 additions & 65 deletions getFunctionData.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,202 @@
import os
import git
import json
import re
import os
import time
import subprocess

def get_function_data(repo_path='../inputData/testRepo2'):
# Determine the output file based on the original repo_path
output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo2') else 'outputData/function_changes.json'

# Determine the directory where this script is located
def get_function_data(repo_path='../inputData/testRepo'):
output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json'
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the path to your repository relative to the script's location
repo_path = os.path.join(script_dir, repo_path)
repo = git.Repo(repo_path)

# Pull the latest changes from the main branch
repo = git.Repo(repo_path)

repo.git.checkout('main')
repo.git.pull()

merge_commits = [commit for commit in repo.iter_commits('main') if commit.parents and len(commit.parents) > 1]
merge_commits.reverse() # Reverse the list to get the oldest merge commit first
merge_commits.reverse()

def create_temp_file_and_get_ast(file_content, temp_file_path='temp.js'):
    """Parse *file_content* as JavaScript and return its Babel AST (or None).

    Delegates the write-and-parse to get_ast_from_js (which writes
    *temp_file_path* itself — the original wrote the file twice) and
    guarantees the temporary file is removed even if parsing raises.

    Args:
        file_content: JavaScript source text to parse.
        temp_file_path: scratch file handed to the node parser.

    Returns:
        The AST as a dict, or None when parsing failed.
    """
    try:
        ast = get_ast_from_js(file_content, temp_file_path)
    finally:
        # Clean up the temporary file even on a parse error/exception.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
    return ast

def get_ast_from_js(file_content, temp_file_path):
    """Write *file_content* to *temp_file_path* and parse it with babelParser.js.

    Returns the parsed AST as a dict, or None when the node parser
    reported anything on stderr.
    """
    with open(temp_file_path, 'w') as js_file:
        js_file.write(file_content)

    parse_proc = subprocess.run(
        ['node', 'babelParser.js', temp_file_path],
        capture_output=True,
        text=True,
    )
    if parse_proc.stderr:
        print("Error in parsing:", parse_proc.stderr)
        return None
    return json.loads(parse_proc.stdout)

def get_functions_from_file(file_content):
    """Collect the names of all functions declared in a JS source string.

    Parses *file_content* into a Babel AST and walks it depth-first,
    recording: function/arrow expressions bound to variables, named
    function declarations/expressions, and class methods.

    Returns:
        list of function-name strings, in AST traversal order.
    """
    # create ast from file content
    ast = create_temp_file_and_get_ast(file_content)

    found_names = []
    try:
        def walk(node):
            # Only dict nodes carry AST structure; skip scalars and None.
            if not isinstance(node, dict):
                return

            if 'type' in node:
                # const foo = function () {} / const foo = () => {}
                if node['type'] in ['VariableDeclarator'] and 'init' in node:
                    initializer = node['init']
                    if initializer and 'type' in initializer and initializer['type'] in ['FunctionExpression', 'ArrowFunctionExpression']:
                        name = None
                        if 'name' in node['id']:
                            name = node['id']['name']
                        if name:
                            found_names.append(name)

                # function foo() {} and named function/arrow nodes
                elif node['type'] in ['FunctionDeclaration', 'FunctionExpression', 'ArrowFunctionExpression']:
                    name = None
                    if 'id' in node and node['id'] is not None:
                        name = node['id']['name']
                    elif 'key' in node and 'name' in node['key']:
                        name = node['key']['name']
                    if name:
                        found_names.append(name)

                # class methods: class A { foo() {} }
                if node['type'] == 'MethodDefinition' and 'key' in node and node['key']['type'] == 'Identifier':
                    found_names.append(node['key']['name'])

            # Recurse into every child dict / list of dicts.
            for child in node.values():
                if isinstance(child, dict):
                    walk(child)
                elif isinstance(child, list):
                    for element in child:
                        if isinstance(element, dict):
                            walk(element)

        walk(ast['program'])
    except Exception as e:
        print(f"Error processing AST: {e}")
    return found_names

def normalize_change_counts(functions):
    """Attach a 'score' in [-1, 1] to each function record, in place.

    The score linearly rescales 'changes_after_merge' across all
    records: the least-changed function scores -1, the most-changed
    scores 1. When every record has the same count (or there is only
    one record), every score is 0.

    Args:
        functions: dict mapping function keys to info dicts carrying a
            numeric 'changes_after_merge' entry. Mutated in place.

    Returns:
        The same dict, with a 'score' added to every value.
    """
    # Empty input: nothing to score (min()/max() would raise ValueError).
    if not functions:
        return functions

    change_counts = [info['changes_after_merge'] for info in functions.values()]
    min_changes = min(change_counts)
    max_changes = max(change_counts)
    span = max_changes - min_changes

    # Normalize the change counts between -1 and 1
    for func_info in functions.values():
        if span:
            func_info['score'] = 2 * ((func_info['changes_after_merge'] - min_changes) / span) - 1
        else:
            func_info['score'] = 0

    return functions

def get_func_name(diff):
    """Return (name, params) tuples for every JS function declaration in *diff*.

    Matches `function <name>(<params>) {` headers. The greedy `[^\\(]+`
    group also swallows any whitespace between the name and the opening
    parenthesis (e.g. `function bar (x)` captured `"bar "`), which then
    polluted the `file::name` keys built from it — so captured names are
    stripped before being returned.

    Args:
        diff: a unified-diff (or any JS source) string.

    Returns:
        list of (function_name, parameter_list) string tuples.
    """
    pattern = re.compile(r'function\s+([^\(]+)\s*\(([^)]*)\)\s*{', re.MULTILINE)
    return [(name.strip(), params) for name, params in pattern.findall(diff)]

def get_full_function_at_commit(repo, commit_hash, function_name, file_path):
    """Return the full source of *function_name* as it exists at *commit_hash*.

    Reads *file_path* from the commit's tree, parses it into a Babel
    AST, and slices the original source between the matched node's
    'start'/'end' character offsets. (The superseded regex-based
    extraction that preceded the AST approach has been removed.)

    Args:
        repo: a git.Repo handle.
        commit_hash: commit SHA to read the file at.
        function_name: JS function name to locate.
        file_path: path of the .js file inside the repo tree.

    Returns:
        The function's source text, or None when the file/AST/function
        cannot be resolved.
    """
    commit = repo.commit(commit_hash)
    blob = commit.tree / file_path
    file_content = blob.data_stream.read().decode('utf-8')

    # create ast from file content
    ast = create_temp_file_and_get_ast(file_content)

    try:
        def find_function(node, target_name):
            """Depth-first search; returns (start, end) offsets or None."""
            if not isinstance(node, dict):
                return None

            # function foo() {}
            if node.get('type') == 'FunctionDeclaration' and node.get('id', {}).get('name') == target_name:
                return node.get('start'), node.get('end')

            # const foo = function () {} / const foo = () => {}
            if node.get('type') == 'VariableDeclarator':
                init_node = node.get('init')
                if isinstance(init_node, dict) and init_node.get('type') in ['FunctionExpression', 'ArrowFunctionExpression']:
                    if node.get('id', {}).get('name') == target_name:
                        return node.get('start'), node.get('end')

            # Recursive traversal of child dicts / lists.
            for value in node.values():
                if isinstance(value, dict):
                    result = find_function(value, target_name)
                    if result:
                        return result
                elif isinstance(value, list):
                    for item in value:
                        result = find_function(item, target_name)
                        if result:
                            return result
            return None

        span = find_function(ast['program'], function_name)
        if span:
            start, end = span
            return file_content[start:end]
    except Exception as e:
        print(f"Error processing AST: {e}")

    return None

functions = {}




for commit in merge_commits:
parent_commit = commit.parents[0]
diffs = commit.diff(parent_commit, create_patch=True)

for diff in diffs:
diff_content = diff.diff.decode('utf-8')
for func_name, _ in get_func_name(diff_content):
full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, diff.a_path)
if full_function:
func_key = f"{diff.a_path}::{func_name}"
if func_key not in functions:
functions[func_key] = {
'function_name': func_name,
'merged_function': full_function,
'commit': commit.hexsha,
'changes_after_merge': 0,
'latest_function': full_function,
'time_first_merged': commit.authored_datetime,
'file_path': diff.a_path
}


for func_key, func_info in functions.items():
for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest
if commit.authored_datetime > func_info['time_first_merged']:
for file_path in commit.stats.files:
if file_path.endswith('.js'):
try:
blob = commit.tree / file_path
file_content = blob.data_stream.read().decode('utf-8')
for func_name in get_functions_from_file(file_content):
full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, file_path)
if full_function:
func_key = f"{file_path}::{func_name}"
if func_key not in functions:
functions[func_key] = {
'function_name': func_name,
'merged_function': full_function,
'commit': commit.hexsha,
'changes_after_merge': 0,
'latest_function': full_function,
'time_first_merged': commit.authored_datetime,
'file_path': file_path
}
except Exception as e:
print(f"Error processing commit {commit.hexsha}: {e}")
continue

for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest commit
for file_path in commit.stats.files:
if file_path.endswith('.js'):
try:
blob = commit.tree / func_info['file_path']
blob = commit.tree / file_path
file_content = blob.data_stream.read().decode('utf-8')
new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], func_info['file_path'])
if new_content and new_content.strip() != func_info['latest_function'].strip():
func_info['changes_after_merge'] += 1
func_info['latest_function'] = new_content
except KeyError:
current_functions = get_functions_from_file(file_content)

for func_key, func_info in functions.items():
if func_info['file_path'] == file_path:
if func_info['function_name'] in current_functions:
new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], file_path)
if new_content and new_content.strip() != func_info['latest_function'].strip() and commit.authored_datetime > func_info['time_first_merged']:
func_info['changes_after_merge'] += 1
func_info['latest_function'] = new_content
except Exception as e:
print(f"Error processing commit {commit.hexsha}: {e}")
continue

# Find the min and max changes after merge
min_changes = min(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge']
max_changes = max(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge']

# Normalize the change counts between -1 and 1
for func_key, func_info in functions.items():
if max_changes != min_changes:
normalized_score = 2 * ((func_info['changes_after_merge'] - min_changes) / (max_changes - min_changes)) - 1
else:
normalized_score = 0
func_info['score'] = normalized_score
# Normalize the change counts to a score between -1 and 1
functions = normalize_change_counts(functions)

# Convert datetime objects to string before saving
for func in functions.values():
Expand All @@ -97,5 +207,9 @@ def get_full_function_at_commit(repo, commit_hash, function_name, file_path):
json.dump(functions, f, indent=4)

if __name__ == '__main__':
    start_time = time.time()
    # Pass repo_path to run another repo than testRepo,
    # e.g. get_function_data(repo_path='../inputData/elixirsolutions')
    get_function_data()
    end_time = time.time()
    elapsed_time = round((end_time - start_time) / 60, 2)  # convert to minutes and round to 2 decimal places
    # NOTE(review): with a non-default repo_path the data actually lands in
    # outputData/function_changes.json — this message assumes the default run.
    print('✅ Printed function data to outputData/test_function_changes.json ✅')
    print(f'⏰ The program took {elapsed_time} minutes to run. ⏰')
Loading
Loading