Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Frontend] Generalise SourceCodeFile class #1997

Merged
merged 4 commits into from
Jan 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 71 additions & 6 deletions src/fuzz_introspector/frontends/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,78 @@
#
################################################################################

from typing import Any, Optional
from typing import Any, Optional, Generic, TypeVar

from tree_sitter import Language, Parser
import tree_sitter_c
import tree_sitter_cpp
import tree_sitter_go
import tree_sitter_java
import tree_sitter_rust

class Project():
import logging

logger = logging.getLogger(name=__name__)

T = TypeVar('T', bound='SourceCodeFile')


class SourceCodeFile():
"""Class for holding file-specific information."""
LANGUAGE: dict[str, Language] = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LANGUAGE is too generic of a name here

'c': Language(tree_sitter_c.language()),
'cpp': Language(tree_sitter_cpp.language()),
'c++': Language(tree_sitter_cpp.language()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should use language constants from constants.py here

'go': Language(tree_sitter_go.language()),
'jvm': Language(tree_sitter_java.language()),
'rust': Language(tree_sitter_rust.language()),
}

def __init__(self,
language: str,
source_file: str,
entrypoint: str = '',
source_content: Optional[bytes] = None):
logger.info('Processing %s' % source_file)

self.root = None
self.source_file = source_file
self.language = language
self.entrypoint = entrypoint
self.tree_sitter_lang = self.LANGUAGE.get(language)
self.parser = Parser(self.tree_sitter_lang)

if source_content:
self.source_content = source_content
else:
with open(self.source_file, 'rb') as f:
self.source_content = f.read()

# Initialization routines
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo

self.load_tree()

# Language specific process
self.language_specific_process()

def load_tree(self):
"""Load the source code into a treesitter tree, and set
the root node."""
if not self.root:
self.root = self.parser.parse(self.source_content).root_node

def language_specific_process(self):
"""Dummy function to perform some specific processes in subclasses."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure we should call this a dummy here; it may be better to simply document how classes that inherit from this one should use the function.

pass

def has_libfuzzer_harness(self) -> bool:
"""Dummy function for source code files."""
return False


class Project(Generic[T]):
"""Wrapper for doing analysis of a collection of source files."""

def __init__(self, source_code_files: list[Any]):
def __init__(self, source_code_files: list[T]):
self.source_code_files = source_code_files

def dump_module_logic(self,
Expand All @@ -35,7 +100,7 @@ def dump_module_logic(self,

def extract_calltree(self,
source_file: str = '',
source_code: Optional[Any] = None,
source_code: Optional[T] = None,
function: Optional[str] = None,
visited_functions: Optional[set[str]] = None,
depth: int = 0,
Expand All @@ -48,14 +113,14 @@ def extract_calltree(self,
def get_reachable_functions(
self,
source_file: str = '',
source_code: Optional[Any] = None,
source_code: Optional[T] = None,
function: Optional[str] = None,
visited_functions: Optional[set[str]] = None) -> set[str]:
"""Get a list of reachable functions for a provided function name."""
# Dummy function for subclasses
return set()

def get_source_codes_with_harnesses(self) -> list[Any]:
def get_source_codes_with_harnesses(self) -> list[T]:
"""Gets the source codes that hold libfuzzer harnesses."""
harnesses = []
for source_code in self.source_code_files:
Expand Down
69 changes: 25 additions & 44 deletions src/fuzz_introspector/frontends/frontend_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,23 @@
################################################################################
"""Fuzz Introspector Light frontend"""

import os
from typing import Any, Optional

import os
import logging

from tree_sitter import Language, Parser
import tree_sitter_c
import yaml

from typing import Any, Optional, Set

from fuzz_introspector.frontends.datatypes import Project
from fuzz_introspector.frontends.datatypes import Project, SourceCodeFile

logger = logging.getLogger(name=__name__)

tree_sitter_languages = {'c': Language(tree_sitter_c.language())}

language_parsers = {'c': Parser(Language(tree_sitter_c.language()))}


class CProject(Project):
class CProject(Project['CSourceCodeFile']):
"""Wrapper for doing analysis of a collection of source files."""

def __init__(self, source_code_files: list['CSourceCodeFile']):
super().__init__(source_code_files)

def dump_module_logic(self,
report_name,
entry_function: str = '',
Expand Down Expand Up @@ -86,8 +81,8 @@ def dump_module_logic(self,
'functionLinenumberEnd'] = func_def.root.end_point.row
func_dict['linkageType'] = ''
func_dict['func_position'] = {
'start': source_code.root.start_point.row,
'end': source_code.root.end_point.row,
'start': func_def.root.start_point.row,
'end': func_def.root.end_point.row,
}
cc_str = 'CyclomaticComplexity'
func_dict[cc_str] = func_def.get_function_complexity()
Expand Down Expand Up @@ -130,9 +125,12 @@ def get_source_code_with_target(self, target_func_name):
return source_code
return None

def get_source_codes_with_harnesses(self) -> list['CSourceCodeFile']:
return super().get_source_codes_with_harnesses()

def extract_calltree(self,
source_file: str = '',
source_code: Optional[Any] = None,
source_code: Optional['CSourceCodeFile'] = None,
function: Optional[str] = None,
visited_functions: Optional[set[str]] = None,
depth: int = 0,
Expand Down Expand Up @@ -182,9 +180,9 @@ def extract_calltree(self,
def get_reachable_functions(
self,
source_file: str = '',
source_code: Optional[Any] = None,
source_code: Optional['CSourceCodeFile'] = None,
function: Optional[str] = None,
visited_functions: Optional[set[str]] = None) -> Set[str]:
visited_functions: Optional[set[str]] = None) -> set[str]:
"""Gets the reachable functions from a given function."""
# Create calltree from a given function
# Find the function in the source code
Expand Down Expand Up @@ -456,28 +454,17 @@ def callsites(self):
return callsites


class SourceCodeFile():
class CSourceCodeFile(SourceCodeFile):
"""Class for holding file-specific information."""

def __init__(self, source_file, language, source_content=""):
self.source_file = source_file
self.language = language
self.parser = language_parsers.get(self.language)
self.tree_sitter_lang = tree_sitter_languages[self.language]

self.root = None
def language_specific_process(self):
"""Perform some language specific processes in subclasses."""
self.function_names = []
self.line_range_pairs = []
self.struct_defs = []
self.typedefs = []
self.includes = set()

if source_content:
self.source_content = source_content
else:
with open(self.source_file, 'rb') as f:
self.source_content = f.read()

# List of function definitions in the source file.
self.func_defs = []

Expand All @@ -488,12 +475,6 @@ def __init__(self, source_file, language, source_content=""):
self._set_function_defintions()
self.extract_types()

def load_tree(self) -> None:
"""Load the the source code into a treesitter tree, and set
the root node."""
if self.language == 'c' and not self.root:
self.root = self.parser.parse(self.source_content).root_node

def extract_types(self):
"""Extracts the types of the source code"""
# Extract all structs
Expand Down Expand Up @@ -640,28 +621,28 @@ def get_linenumber(self, bytepos):


def load_treesitter_trees(source_files: list[str],
is_log: bool = True) -> list[SourceCodeFile]:
is_log: bool = True) -> CProject:
"""Creates treesitter trees for all files in a given list of source files."""
results = []

for code_file in source_files:
if not os.path.isfile(code_file):
continue

source_cls = SourceCodeFile(code_file, 'c')
source_cls = CSourceCodeFile('c', code_file)

if is_log:
if source_cls.has_libfuzzer_harness():
logger.info('harness: %s', code_file)

results.append(source_cls)

return results
return CProject(results)


def analyse_source_code(source_content: str) -> SourceCodeFile:
def analyse_source_code(source_content: str) -> CSourceCodeFile:
"""Returns a source abstraction based on a single source string."""
source_code = SourceCodeFile(source_file='in-memory string',
language='c',
source_content=source_content.encode())
source_code = CSourceCodeFile('c',
source_file='in-memory string',
source_content=source_content.encode())
return source_code
Loading
Loading