diff --git a/cmd/katana/main.go b/cmd/katana/main.go
index 1be5933d..c2ce8925 100644
--- a/cmd/katana/main.go
+++ b/cmd/katana/main.go
@@ -22,78 +23,59 @@ import (
 )
 
 var (
 	cfgFile string
 	options = &types.Options{}
 )
 
 func main() {
 	flagSet, err := readFlags()
-	if err != nil {
-		gologger.Fatal().Msgf("Could not read flags: %s\n", err)
-	}
+	handleError("Could not read flags", err)
 
 	if options.HealthCheck {
 		gologger.Print().Msgf("%s\n", runner.DoHealthCheck(options, flagSet))
 		os.Exit(0)
 	}
 
 	katanaRunner, err := runner.New(options)
 	if err != nil || katanaRunner == nil {
 		if options.Version {
 			return
 		}
 		gologger.Fatal().Msgf("could not create runner: %s\n", err)
 	}
 	defer katanaRunner.Close()
 
-	// close handler
 	resumeFilename := defaultResumeFilename()
-	go func() {
-		c := make(chan os.Signal, 1)
-		signal.Notify(c, os.Interrupt, syscall.SIGTERM)
-		for range c {
-			gologger.DefaultLogger.Info().Msg("- Ctrl+C pressed in Terminal")
-			katanaRunner.Close()
-
-			gologger.Info().Msgf("Creating resume file: %s\n", resumeFilename)
-			err := katanaRunner.SaveState(resumeFilename)
-			if err != nil {
-				gologger.Error().Msgf("Couldn't create resume file: %s\n", err)
-			}
-
-			os.Exit(0)
-		}
-	}()
-
-	if err := katanaRunner.ExecuteCrawling(); err != nil {
-		gologger.Fatal().Msgf("could not execute crawling: %s", err)
-	}
+	setupCloseHandler(katanaRunner, resumeFilename)
 
-	// on successful execution:
+	err = katanaRunner.ExecuteCrawling()
+	handleError("could not execute crawling", err)
 
-	// deduplicate the lines in each file in the store-field-dir
-	//use options.StoreFieldDir once https://github.com/projectdiscovery/katana/pull/877 is merged
+	// Deduplicate lines in each file in the store-field-dir.
+	// Use options.StoreFieldDir once https://github.com/projectdiscovery/katana/pull/877 is merged.
 	storeFieldDir := "katana_field"
 	_ = folderutil.DedupeLinesInFiles(storeFieldDir)
 
-	// remove the resume file in case it exists
+	// Remove the resume file if it exists
 	if fileutil.FileExists(resumeFilename) {
 		os.Remove(resumeFilename)
 	}
-
 }
 
 func readFlags() (*goflags.FlagSet, error) {
 	flagSet := goflags.NewFlagSet()
-	flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation
-pipelines offering both headless and non-headless crawling.`)
+	flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation pipelines offering both headless and non-headless crawling.`)
 
+	// Input group
 	flagSet.CreateGroup("input", "Input",
 		flagSet.StringSliceVarP(&options.URLs, "list", "u", nil, "target url / list to crawl", goflags.FileCommaSeparatedStringSliceOptions),
 		flagSet.StringVar(&options.Resume, "resume", "", "resume scan using resume.cfg"),
 		flagSet.StringSliceVarP(&options.Exclude, "exclude", "e", nil, "exclude host matching specified filter ('cdn', 'private-ips', cidr, ip, regex)", goflags.CommaSeparatedStringSliceOptions),
 	)
 
+	// Configuration group
flagSet.CreateGroup("config", "Configuration", flagSet.StringSliceVarP(&options.Resolvers, "resolvers", "r", nil, "list of custom resolver (file or comma separated)", goflags.FileCommaSeparatedStringSliceOptions), flagSet.IntVarP(&options.MaxDepth, "depth", "d", 3, "maximum depth to crawl"), @@ -115,13 +97,16 @@ pipelines offering both headless and non-headless crawling.`) flagSet.BoolVarP(&options.IgnoreQueryParams, "ignore-query-params", "iqp", false, "Ignore crawling same path with different query-param values"), flagSet.BoolVarP(&options.TlsImpersonate, "tls-impersonate", "tlsi", false, "enable experimental client hello (ja3) tls randomization"), flagSet.BoolVarP(&options.DisableRedirects, "disable-redirects", "dr", false, "disable following redirects (default false)"), + flagSet.BoolVarP(&options.UseDynamicScope, "use-dynamic-scope", "uds", false, "Use dynamic scoping to avoid crawling similar pages"), ) + // Debug group flagSet.CreateGroup("debug", "Debug", flagSet.BoolVarP(&options.HealthCheck, "hc", "health-check", false, "run diagnostic check up"), flagSet.StringVarP(&options.ErrorLogFile, "error-log", "elog", "", "file to write sent requests error log"), ) + // Headless group flagSet.CreateGroup("headless", "Headless", flagSet.BoolVarP(&options.Headless, "headless", "hl", false, "enable headless hybrid crawling (experimental)"), flagSet.BoolVarP(&options.UseInstalledChrome, "system-chrome", "sc", false, "use local installed chrome browser instead of katana installed"), @@ -135,6 +120,7 @@ pipelines offering both headless and non-headless crawling.`) flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"), ) + // Scope group flagSet.CreateGroup("scope", "Scope", flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions), flagSet.StringSliceVarP(&options.OutOfScope, "crawl-out-scope", "cos", nil, "out of scope url regex to be excluded by crawler", goflags.FileCommaSeparatedStringSliceOptions), @@ -155,6 +141,7 @@ pipelines offering both headless and non-headless crawling.`) flagSet.StringVarP(&options.OutputFilterCondition, "filter-condition", "fdc", "", "filter response with dsl based condition"), ) + // Rate-Limit group flagSet.CreateGroup("ratelimit", "Rate-Limit", flagSet.IntVarP(&options.Concurrency, "concurrency", "c", 10, "number of concurrent fetchers to use"), flagSet.IntVarP(&options.Parallelism, "parallelism", "p", 10, "number of concurrent inputs to process"), @@ -163,11 +150,13 @@ pipelines offering both headless and non-headless crawling.`) flagSet.IntVarP(&options.RateLimitMinute, "rate-limit-minute", "rlm", 0, "maximum number of requests to send per minute"), ) + // Update group flagSet.CreateGroup("update", "Update", flagSet.CallbackVarP(runner.GetUpdateCallback(), "update", "up", "update katana to latest version"), flagSet.BoolVarP(&options.DisableUpdateCheck, "disable-update-check", "duc", false, "disable automatic katana update check"), ) + // Output group flagSet.CreateGroup("output", "Output", flagSet.StringVarP(&options.OutputFile, "output", "o", "", "file to write output to"), flagSet.BoolVarP(&options.StoreResponse, "store-response", "sr", false, "store http requests/responses"), @@ -207,19 +196,33 @@ func init() { func defaultResumeFilename() string { homedir, err := os.UserHomeDir() - if err != nil { - gologger.Fatal().Msgf("could not get home directory: %s", err) - } + handleError("could 
 	configDir := filepath.Join(homedir, ".config", "katana")
 	return filepath.Join(configDir, fmt.Sprintf("resume-%s.cfg", xid.New().String()))
 }
 
-// cleanupOldResumeFiles cleans up resume files older than 10 days.
+// setupCloseHandler shuts the runner down on SIGINT/SIGTERM and writes a
+// resume file so the scan can be continued later.
+func setupCloseHandler(katanaRunner *runner.Runner, resumeFilename string) {
+	go func() {
+		c := make(chan os.Signal, 1)
+		signal.Notify(c, os.Interrupt, syscall.SIGTERM)
+		for range c {
+			gologger.DefaultLogger.Info().Msg("- Ctrl+C pressed in Terminal")
+			katanaRunner.Close()
+
+			gologger.Info().Msgf("Creating resume file: %s\n", resumeFilename)
+			err := katanaRunner.SaveState(resumeFilename)
+			if err != nil {
+				gologger.Error().Msgf("Couldn't create resume file: %s\n", err)
+			}
+
+			os.Exit(0)
+		}
+	}()
+}
+
+// cleanupOldResumeFiles cleans up resume files older than 10 days.
 func cleanupOldResumeFiles() {
 	homedir, err := os.UserHomeDir()
-	if err != nil {
-		gologger.Fatal().Msgf("could not get home directory: %s", err)
-	}
+	handleError("could not get home directory", err)
 	root := filepath.Join(homedir, ".config", "katana")
 	filter := fileutil.FileFilters{
 		OlderThan: 24 * time.Hour * 10, // cleanup on the 10th day
@@ -227,3 +230,9 @@ func cleanupOldResumeFiles() {
 	}
 	_ = fileutil.DeleteFilesOlderThan(root, filter)
 }
+
+// handleError logs a fatal error and exits when err is non-nil.
+func handleError(message string, err error) {
+	if err != nil {
+		gologger.Fatal().Msgf("%s: %s\n", message, err)
+	}
+}
diff --git a/internal/runner/runner.go b/internal/runner/runner.go
index 9fcd09fe..02bef747 100644
--- a/internal/runner/runner.go
+++ b/internal/runner/runner.go
@@ -7,6 +7,7 @@ import (
 	"github.com/projectdiscovery/gologger"
 	"github.com/projectdiscovery/katana/pkg/engine"
+	"github.com/projectdiscovery/katana/pkg/engine/dynamic"
 	"github.com/projectdiscovery/katana/pkg/engine/hybrid"
 	"github.com/projectdiscovery/katana/pkg/engine/parser"
 	"github.com/projectdiscovery/katana/pkg/engine/standard"
@@ -95,6 +96,8 @@ func New(options *types.Options) (*Runner, error) {
 	var crawler engine.Engine
 
 	switch {
+	case options.UseDynamicScope: // dynamic scoping takes precedence over headless crawling
+		crawler, err = dynamic.New(crawlerOptions)
 	case options.Headless:
 		crawler, err = hybrid.New(crawlerOptions)
 	default:
diff --git a/pkg/engine/dynamic/crawl.go b/pkg/engine/dynamic/crawl.go
new file mode 100644
index 00000000..6c2470db
--- /dev/null
+++ b/pkg/engine/dynamic/crawl.go
@@ -0,0 +1,119 @@
+package dynamic
+
+import (
+	"bytes"
+	"context"
+	"io"
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"github.com/projectdiscovery/katana/pkg/engine/common"
+	"github.com/projectdiscovery/katana/pkg/navigation"
+	"github.com/projectdiscovery/katana/pkg/utils"
+	"github.com/projectdiscovery/retryablehttp-go"
+	errorutil "github.com/projectdiscovery/utils/errors"
+	mapsutil "github.com/projectdiscovery/utils/maps"
+)
+
+// makeRequest performs a single crawl request and applies TF-IDF based
+// dynamic scoping before the response is handed to the parser.
+func (c *Crawler) makeRequest(s *common.CrawlSession, request *navigation.Request) (*navigation.Response, error) {
+	response := &navigation.Response{
+		Depth:        request.Depth + 1,
+		RootHostname: s.Hostname,
+	}
+	ctx := context.WithValue(s.Ctx, navigation.Depth{}, request.Depth)
+	httpReq, err := http.NewRequestWithContext(ctx, request.Method, request.URL, nil)
+	if err != nil {
+		return response, err
+	}
+	if request.Body != "" && request.Method != "GET" {
+		httpReq.Body = io.NopCloser(strings.NewReader(request.Body))
+	}
+	req, err := retryablehttp.FromRequest(httpReq)
+	if err != nil {
+		return response, err
+	}
+	req.Header.Set("User-Agent", utils.WebUserAgent())
+
+	for k, v := range request.Headers {
+		req.Header.Set(k, v)
+		if k == "Host" {
+			req.Host = v
+		}
+	}
+	for k, v := range c.Headers {
+		req.Header.Set(k, v)
+		if k == "Host" {
+			req.Host = v
+		}
+	}
+
+	resp, err := s.HttpClient.Do(req)
+	if resp != nil {
+		defer func() {
+			if resp.Body != nil && resp.StatusCode != http.StatusSwitchingProtocols {
+				_, _ = io.Copy(io.Discard, resp.Body)
+			}
+			_ = resp.Body.Close()
+		}()
+	}
+
+	rawRequestBytes, _ := req.Dump()
+	request.Raw = string(rawRequestBytes)
+
+	if err != nil {
+		return response, err
+	}
+	if resp.StatusCode == http.StatusSwitchingProtocols {
+		return response, nil
+	}
+	limitReader := io.LimitReader(resp.Body, int64(c.Options.Options.BodyReadSize))
+	data, err := io.ReadAll(limitReader)
+	if err != nil {
+		return response, err
+	}
+	if !c.Options.UniqueFilter.UniqueContent(data) {
+		return &navigation.Response{}, nil
+	}
+
+	// Dynamic scoping: skip the page when its TF-IDF cosine similarity to any
+	// previously crawled page exceeds the threshold, otherwise record it.
+	// Tokenization is intentionally naive (a whitespace split over the raw
+	// body, markup included), since similar templates yield similar tokens.
+	words := strings.Fields(string(data))
+	for _, score := range c.tfidfModel.Calculate(words) {
+		if score > c.similarityThreshold {
+			return &navigation.Response{}, nil
+		}
+	}
+	c.tfidfModel.AddDocument(request.URL, words)
+
+	technologies := c.Options.Wappalyzer.Fingerprint(resp.Header, data)
+	response.Technologies = mapsutil.GetKeys(technologies)
+
+	resp.Body = io.NopCloser(strings.NewReader(string(data)))
+
+	response.Body = string(data)
+	response.Resp = resp
+	response.Reader, err = goquery.NewDocumentFromReader(bytes.NewReader(data))
+	if err != nil {
+		return response, errorutil.NewWithTag("dynamic", "could not make document from reader").Wrap(err)
+	}
+	response.Reader.Url, _ = url.Parse(request.URL)
+	response.StatusCode = resp.StatusCode
+	response.Headers = utils.FlattenHeaders(resp.Header)
+	if c.Options.Options.FormExtraction {
+		response.Forms = append(response.Forms, utils.ParseFormFields(response.Reader)...)
+	}
+
+	resp.ContentLength = int64(len(data))
+
+	rawResponseBytes, _ := httputil.DumpResponse(resp, true)
+	response.Raw = string(rawResponseBytes)
+
+	return response, nil
+}
diff --git a/pkg/engine/dynamic/dynamic.go b/pkg/engine/dynamic/dynamic.go
new file mode 100644
index 00000000..684e0f3a
--- /dev/null
+++ b/pkg/engine/dynamic/dynamic.go
@@ -0,0 +1,53 @@
+package dynamic
+
+import (
+	"github.com/projectdiscovery/gologger"
+	"github.com/projectdiscovery/katana/pkg/engine/common"
+	"github.com/projectdiscovery/katana/pkg/tfidf"
+	"github.com/projectdiscovery/katana/pkg/types"
+	errorutil "github.com/projectdiscovery/utils/errors"
+)
+
+// defaultSimilarityThreshold is the cosine-similarity score above which a
+// page is treated as a near-duplicate of an already crawled page.
+const defaultSimilarityThreshold = 0.7
+
+// Crawler is a dynamic-scope crawler instance; it behaves like the standard
+// crawler but skips pages whose content is too similar to pages already seen.
+type Crawler struct {
+	*common.Shared
+	tfidfModel          *tfidf.TfIdf
+	similarityThreshold float64
+}
+
+// New returns a new dynamic crawler instance
+func New(options *types.CrawlerOptions) (*Crawler, error) {
+	shared, err := common.NewShared(options)
+	if err != nil {
+		return nil, errorutil.NewWithErr(err).WithTag("dynamic")
+	}
+	return &Crawler{
+		Shared:              shared,
+		tfidfModel:          tfidf.New(),
+		similarityThreshold: defaultSimilarityThreshold,
+	}, nil
+}
+
+// Close closes the crawler process
+func (c *Crawler) Close() error {
+	return nil
+}
+
+// Crawl crawls a URL with the specified options
+func (c *Crawler) Crawl(rootURL string) error {
+	crawlSession, err := c.NewCrawlSessionWithURL(rootURL)
+	if err != nil {
+		return errorutil.NewWithErr(err).WithTag("dynamic")
+	}
+	defer crawlSession.CancelFunc()
+	gologger.Info().Msgf("Started dynamic crawling for => %v", rootURL)
+	if err := c.Do(crawlSession, c.makeRequest); err != nil {
+		return errorutil.NewWithErr(err).WithTag("dynamic")
+	}
+	return nil
+}
diff --git a/pkg/tfidf/tfidf.go b/pkg/tfidf/tfidf.go
new file mode 100644
index 00000000..63f5be4c
--- /dev/null
+++ b/pkg/tfidf/tfidf.go
@@ -0,0 +1,106 @@
+// Package tfidf implements a small, concurrency-safe TF-IDF model used by
+// the dynamic crawler to estimate how similar a new page is to pages it
+// has already seen. Similarity is scored as the cosine of TF-IDF vectors,
+// so values always fall in [0,1].
+package tfidf
+
+import (
+	"math"
+	"strings"
+	"sync"
+)
+
+// TfIdf holds per-document term counts plus global document frequencies.
+type TfIdf struct {
+	documents map[string]map[string]int // docID -> term -> count
+	docFreq   map[string]int            // term -> number of documents containing it
+	totalDocs int
+	mutex     sync.Mutex
+}
+
+// New returns an empty TF-IDF model.
+func New() *TfIdf {
+	return &TfIdf{
+		documents: make(map[string]map[string]int),
+		docFreq:   make(map[string]int),
+	}
+}
+
+// AddDocument records the term counts of a document under docID.
+// Re-adding an existing docID is a no-op.
+func (t *TfIdf) AddDocument(docID string, words []string) {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+
+	if _, ok := t.documents[docID]; ok {
+		return
+	}
+
+	wordCount := make(map[string]int)
+	for _, word := range words {
+		wordCount[strings.ToLower(word)]++
+	}
+
+	t.documents[docID] = wordCount
+	t.totalDocs++
+
+	for word := range wordCount {
+		t.docFreq[word]++
+	}
+}
+
+// Calculate returns, for every stored document, the cosine similarity
+// between that document's TF-IDF vector and the TF-IDF vector of words.
+func (t *TfIdf) Calculate(words []string) []float64 {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+
+	queryCount := make(map[string]int)
+	for _, word := range words {
+		queryCount[strings.ToLower(word)]++
+	}
+	queryVec := t.vector(queryCount, len(words))
+
+	scores := make([]float64, 0, len(t.documents))
+	for _, docCount := range t.documents {
+		total := 0
+		for _, n := range docCount {
+			total += n
+		}
+		scores = append(scores, cosine(queryVec, t.vector(docCount, total)))
+	}
+	return scores
+}
+
+// vector converts raw term counts into a TF-IDF weight vector. The IDF term
+// is smoothed so it stays finite while the model is still empty.
+func (t *TfIdf) vector(counts map[string]int, total int) map[string]float64 {
+	vec := make(map[string]float64, len(counts))
+	if total == 0 {
+		return vec
+	}
+	for word, count := range counts {
+		tf := float64(count) / float64(total)
+		idf := math.Log(float64(1+t.totalDocs) / float64(1+t.docFreq[word]))
+		vec[word] = tf * idf
+	}
+	return vec
+}
+
+// cosine computes the cosine similarity of two sparse weight vectors.
+func cosine(a, b map[string]float64) float64 {
+	var dot, normA, normB float64
+	for k, va := range a {
+		if vb, ok := b[k]; ok {
+			dot += va * vb
+		}
+		normA += va * va
+	}
+	for _, vb := range b {
+		normB += vb * vb
+	}
+	if normA == 0 || normB == 0 {
+		return 0
+	}
+	return dot / (math.Sqrt(normA) * math.Sqrt(normB))
+}
diff --git a/pkg/types/options.go b/pkg/types/options.go
index 43e6e8f3..1f7a65a0 100644
--- a/pkg/types/options.go
+++ b/pkg/types/options.go
@@ -155,6 +155,8 @@ type Options struct {
 	TlsImpersonate bool
 	//DisableRedirects disables the following of redirects
 	DisableRedirects bool
+	//UseDynamicScope enables tf-idf based dynamic scoping to skip near-duplicate pages
+	UseDynamicScope bool
 }
 
 func (options *Options) ParseCustomHeaders() map[string]string {