Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Abstract syntax trees #43

Merged
merged 23 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions .github/workflows/get_function_data_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,41 @@ on:

jobs:
  test:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
        with:
          path: 'functionRetriever' # Specify a path for the main repo

      - name: Clone testRepo repository
        run: |
          mkdir -p ${{ github.workspace }}/inputData # Create inputData directory
          git clone https://github.com/RapidReview-ai/testRepo ${{ github.workspace }}/inputData/testRepo

      - name: Create outputData directory
        run: mkdir -p ${{ github.workspace }}/functionRetriever/outputData

      - name: Set up Python 3.x
        uses: actions/setup-python@v2
        with:
          python-version: '3.11.1'

      - name: Set up Node.js
        uses: actions/setup-node@v2
        with:
          node-version: '18.12.1'

      - name: Install npm dependencies
        run: npm install
        working-directory: ${{ github.workspace }}/functionRetriever

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ${{ github.workspace }}/functionRetriever

      - name: Run tests
        run: python test_get_function_data.py
        working-directory: ${{ github.workspace }}/functionRetriever
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
.env
**.json
function_changes.json
test_function_changes.json
package-lock.json
path
.DS_Store
**/__pycache__
node_modules
temp.js
14 changes: 14 additions & 0 deletions babelParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Parse a JavaScript file with @babel/parser and print its AST as JSON.
// Usage: node babelParser.js <path-to-js-file>
const babel = require('@babel/parser');
const fs = require('fs');

const code = fs.readFileSync(process.argv[2], 'utf8');

try {
    const ast = babel.parse(code, {
        sourceType: "module",
        plugins: [],
    });
    // Stdout carries the AST; the Python caller JSON-parses it.
    console.log(JSON.stringify(ast));
} catch (error) {
    // Report on stderr AND signal failure via the exit status, so callers
    // that check the return code (not just stderr) can detect the error.
    console.error("Parsing error:", error);
    process.exitCode = 1;
}
4 changes: 2 additions & 2 deletions createEmbeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from qdrant_client import QdrantClient
from qdrant_client.models import CollectionDescription, Distance, VectorParams, Record

def embed_sample_functions():
def embed_sample_functions(repo_path):
# Initialize Qdrant Client
client = QdrantClient(host='localhost', port=6333)
# client = QdrantClient(":memory:")
Expand All @@ -17,7 +17,7 @@ def embed_sample_functions():
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the JSON data from the file
json_file_path = 'outputData/function_changes.json' # depends on how you run the file, should be changed to be global and not local path
json_file_path = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json'
with open(json_file_path, 'r') as file:
json_data = json.load(file)

Expand Down
244 changes: 179 additions & 65 deletions getFunctionData.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,202 @@
import os
import git
import json
import re
import os
import time
import subprocess

def get_function_data(repo_path='../inputData/testRepo2'):
# Determine the output file based on the original repo_path
output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo2') else 'outputData/function_changes.json'

# Determine the directory where this script is located
def get_function_data(repo_path='../inputData/testRepo'):
output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json'
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the path to your repository relative to the script's location
repo_path = os.path.join(script_dir, repo_path)
repo = git.Repo(repo_path)

# Pull the latest changes from the main branch
repo = git.Repo(repo_path)

repo.git.checkout('main')
repo.git.pull()

merge_commits = [commit for commit in repo.iter_commits('main') if commit.parents and len(commit.parents) > 1]
merge_commits.reverse() # Reverse the list to get the oldest merge commit first
merge_commits.reverse()

def create_temp_file_and_get_ast(file_content, temp_file_path='temp.js'):
    """Parse *file_content* as JavaScript and return its Babel AST (or None).

    Delegates the write-and-parse to get_ast_from_js (which writes
    *temp_file_path* itself — the original wrote the file twice) and
    guarantees the temporary file is removed even if parsing raises.

    Args:
        file_content: JavaScript source text to parse.
        temp_file_path: scratch file handed to the node parser.

    Returns:
        The AST as a dict, or None when parsing failed.
    """
    try:
        ast = get_ast_from_js(file_content, temp_file_path)
    finally:
        # Clean up the temporary file even on a parse error/exception.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
    return ast

def get_ast_from_js(file_content, temp_file_path):
    """Write *file_content* to *temp_file_path* and parse it with babelParser.js.

    Returns the parsed AST as a dict, or None when the node parser
    reported anything on stderr.
    """
    with open(temp_file_path, 'w') as js_file:
        js_file.write(file_content)

    parse_proc = subprocess.run(
        ['node', 'babelParser.js', temp_file_path],
        capture_output=True,
        text=True,
    )
    if parse_proc.stderr:
        print("Error in parsing:", parse_proc.stderr)
        return None
    return json.loads(parse_proc.stdout)

def get_functions_from_file(file_content):
    """Collect the names of all functions declared in a JS source string.

    Parses *file_content* into a Babel AST and walks it depth-first,
    recording: function/arrow expressions bound to variables, named
    function declarations/expressions, and class methods.

    Returns:
        list of function-name strings, in AST traversal order.
    """
    # create ast from file content
    ast = create_temp_file_and_get_ast(file_content)

    found_names = []
    try:
        def walk(node):
            # Only dict nodes carry AST structure; skip scalars and None.
            if not isinstance(node, dict):
                return

            if 'type' in node:
                # const foo = function () {} / const foo = () => {}
                if node['type'] in ['VariableDeclarator'] and 'init' in node:
                    initializer = node['init']
                    if initializer and 'type' in initializer and initializer['type'] in ['FunctionExpression', 'ArrowFunctionExpression']:
                        name = None
                        if 'name' in node['id']:
                            name = node['id']['name']
                        if name:
                            found_names.append(name)

                # function foo() {} and named function/arrow nodes
                elif node['type'] in ['FunctionDeclaration', 'FunctionExpression', 'ArrowFunctionExpression']:
                    name = None
                    if 'id' in node and node['id'] is not None:
                        name = node['id']['name']
                    elif 'key' in node and 'name' in node['key']:
                        name = node['key']['name']
                    if name:
                        found_names.append(name)

                # class methods: class A { foo() {} }
                if node['type'] == 'MethodDefinition' and 'key' in node and node['key']['type'] == 'Identifier':
                    found_names.append(node['key']['name'])

            # Recurse into every child dict / list of dicts.
            for child in node.values():
                if isinstance(child, dict):
                    walk(child)
                elif isinstance(child, list):
                    for element in child:
                        if isinstance(element, dict):
                            walk(element)

        walk(ast['program'])
    except Exception as e:
        print(f"Error processing AST: {e}")
    return found_names

def normalize_change_counts(functions):
    """Attach a 'score' in [-1, 1] to each function record, in place.

    The score linearly rescales 'changes_after_merge' across all
    records: the least-changed function scores -1, the most-changed
    scores 1. When every record has the same count (or there is only
    one record), every score is 0.

    Args:
        functions: dict mapping function keys to info dicts carrying a
            numeric 'changes_after_merge' entry. Mutated in place.

    Returns:
        The same dict, with a 'score' added to every value.
    """
    # Empty input: nothing to score (min()/max() would raise ValueError).
    if not functions:
        return functions

    change_counts = [info['changes_after_merge'] for info in functions.values()]
    min_changes = min(change_counts)
    max_changes = max(change_counts)
    span = max_changes - min_changes

    # Normalize the change counts between -1 and 1
    for func_info in functions.values():
        if span:
            func_info['score'] = 2 * ((func_info['changes_after_merge'] - min_changes) / span) - 1
        else:
            func_info['score'] = 0

    return functions

def get_func_name(diff):
    """Return (name, params) tuples for every JS function declaration in *diff*.

    Matches `function <name>(<params>) {` headers. The greedy `[^\\(]+`
    group also swallows any whitespace between the name and the opening
    parenthesis (e.g. `function bar (x)` captured `"bar "`), which then
    polluted the `file::name` keys built from it — so captured names are
    stripped before being returned.

    Args:
        diff: a unified-diff (or any JS source) string.

    Returns:
        list of (function_name, parameter_list) string tuples.
    """
    pattern = re.compile(r'function\s+([^\(]+)\s*\(([^)]*)\)\s*{', re.MULTILINE)
    return [(name.strip(), params) for name, params in pattern.findall(diff)]

def get_full_function_at_commit(repo, commit_hash, function_name, file_path):
    """Return the full source of *function_name* as it exists at *commit_hash*.

    Reads *file_path* from the commit's tree, parses it into a Babel
    AST, and slices the original source between the matched node's
    'start'/'end' character offsets. (The superseded regex-based
    extraction that preceded the AST approach has been removed.)

    Args:
        repo: a git.Repo handle.
        commit_hash: commit SHA to read the file at.
        function_name: JS function name to locate.
        file_path: path of the .js file inside the repo tree.

    Returns:
        The function's source text, or None when the file/AST/function
        cannot be resolved.
    """
    commit = repo.commit(commit_hash)
    blob = commit.tree / file_path
    file_content = blob.data_stream.read().decode('utf-8')

    # create ast from file content
    ast = create_temp_file_and_get_ast(file_content)

    try:
        def find_function(node, target_name):
            """Depth-first search; returns (start, end) offsets or None."""
            if not isinstance(node, dict):
                return None

            # function foo() {}
            if node.get('type') == 'FunctionDeclaration' and node.get('id', {}).get('name') == target_name:
                return node.get('start'), node.get('end')

            # const foo = function () {} / const foo = () => {}
            if node.get('type') == 'VariableDeclarator':
                init_node = node.get('init')
                if isinstance(init_node, dict) and init_node.get('type') in ['FunctionExpression', 'ArrowFunctionExpression']:
                    if node.get('id', {}).get('name') == target_name:
                        return node.get('start'), node.get('end')

            # Recursive traversal of child dicts / lists.
            for value in node.values():
                if isinstance(value, dict):
                    result = find_function(value, target_name)
                    if result:
                        return result
                elif isinstance(value, list):
                    for item in value:
                        result = find_function(item, target_name)
                        if result:
                            return result
            return None

        span = find_function(ast['program'], function_name)
        if span:
            start, end = span
            return file_content[start:end]
    except Exception as e:
        print(f"Error processing AST: {e}")

    return None

functions = {}




for commit in merge_commits:
parent_commit = commit.parents[0]
diffs = commit.diff(parent_commit, create_patch=True)

for diff in diffs:
diff_content = diff.diff.decode('utf-8')
for func_name, _ in get_func_name(diff_content):
full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, diff.a_path)
if full_function:
func_key = f"{diff.a_path}::{func_name}"
if func_key not in functions:
functions[func_key] = {
'function_name': func_name,
'merged_function': full_function,
'commit': commit.hexsha,
'changes_after_merge': 0,
'latest_function': full_function,
'time_first_merged': commit.authored_datetime,
'file_path': diff.a_path
}


for func_key, func_info in functions.items():
for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest
if commit.authored_datetime > func_info['time_first_merged']:
for file_path in commit.stats.files:
if file_path.endswith('.js'):
try:
blob = commit.tree / file_path
file_content = blob.data_stream.read().decode('utf-8')
for func_name in get_functions_from_file(file_content):
full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, file_path)
if full_function:
func_key = f"{file_path}::{func_name}"
if func_key not in functions:
functions[func_key] = {
'function_name': func_name,
'merged_function': full_function,
'commit': commit.hexsha,
'changes_after_merge': 0,
'latest_function': full_function,
'time_first_merged': commit.authored_datetime,
'file_path': file_path
}
except Exception as e:
print(f"Error processing commit {commit.hexsha}: {e}")
continue

for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest commit
for file_path in commit.stats.files:
if file_path.endswith('.js'):
try:
blob = commit.tree / func_info['file_path']
blob = commit.tree / file_path
file_content = blob.data_stream.read().decode('utf-8')
new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], func_info['file_path'])
if new_content and new_content.strip() != func_info['latest_function'].strip():
func_info['changes_after_merge'] += 1
func_info['latest_function'] = new_content
except KeyError:
current_functions = get_functions_from_file(file_content)

for func_key, func_info in functions.items():
if func_info['file_path'] == file_path:
if func_info['function_name'] in current_functions:
new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], file_path)
if new_content and new_content.strip() != func_info['latest_function'].strip() and commit.authored_datetime > func_info['time_first_merged']:
func_info['changes_after_merge'] += 1
func_info['latest_function'] = new_content
except Exception as e:
print(f"Error processing commit {commit.hexsha}: {e}")
continue

# Find the min and max changes after merge
min_changes = min(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge']
max_changes = max(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge']

# Normalize the change counts between -1 and 1
for func_key, func_info in functions.items():
if max_changes != min_changes:
normalized_score = 2 * ((func_info['changes_after_merge'] - min_changes) / (max_changes - min_changes)) - 1
else:
normalized_score = 0
func_info['score'] = normalized_score
# Normalize the change counts to a score between -1 and 1
functions = normalize_change_counts(functions)

# Convert datetime objects to string before saving
for func in functions.values():
Expand All @@ -97,5 +207,9 @@ def get_full_function_at_commit(repo, commit_hash, function_name, file_path):
json.dump(functions, f, indent=4)

if __name__ == '__main__':
    start_time = time.time()
    # Pass repo_path to run another repo than testRepo,
    # e.g. get_function_data(repo_path='../inputData/elixirsolutions')
    get_function_data()
    end_time = time.time()
    elapsed_time = round((end_time - start_time) / 60, 2)  # convert to minutes and round to 2 decimal places
    # NOTE(review): with a non-default repo_path the data actually lands in
    # outputData/function_changes.json — this message assumes the default run.
    print('✅ Printed function data to outputData/test_function_changes.json ✅')
    print(f'⏰ The program took {elapsed_time} minutes to run. ⏰')
Loading
Loading