diff --git a/setup.cfg b/setup.cfg index 71d62d573..3bc9c3023 100644 --- a/setup.cfg +++ b/setup.cfg @@ -98,10 +98,12 @@ install_requires = django-rq==2.10.1 rq-scheduler==0.13.1 + # redhat pipeline + extractcode[full]==31.0.0 + #vulntotal python-dotenv==0.20.0 texttable==1.6.4 - extractcode[full]==31.0.0 #hashid uritemplate==4.2.0 diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 439e69731..c0cf04ed7 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -32,6 +32,7 @@ from vulnerabilities.importers import ubuntu_usn from vulnerabilities.importers import vulnrichment from vulnerabilities.importers import xen +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 from vulnerabilities.pipelines import alpine_linux_importer from vulnerabilities.pipelines import github_importer from vulnerabilities.pipelines import gitlab_importer @@ -189,3 +190,9 @@ collect_fix_commits_v2.CollectGitlabFixCommitsPipeline, ] ) + +TODO_EXCLUDED_PIPELINES = [ + key + for key, value in IMPORTERS_REGISTRY.items() + if issubclass(value, VulnerableCodeBaseImporterPipelineV2) and value.exclude_from_package_todo +] diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index 11fa5126a..c55c14c8a 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -19,6 +19,7 @@ from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline from vulnerabilities.pipelines import remove_duplicate_advisories from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees +from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2 from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2 from vulnerabilities.pipelines.v2_improvers import ( computer_package_version_rank as compute_version_rank_v2, @@ -72,5 +73,6 @@ 
collect_ssvc_trees.CollectSSVCPipeline, relate_severities.RelateSeveritiesPipeline, group_advisories_for_packages.GroupAdvisoriesForPackages, + compute_advisory_todo_v2.ComputeToDo, ] ) diff --git a/vulnerabilities/migrations/0124_advisorytodov2_todorelatedadvisoryv2_and_more.py b/vulnerabilities/migrations/0124_advisorytodov2_todorelatedadvisoryv2_and_more.py new file mode 100644 index 000000000..48f14fc2c --- /dev/null +++ b/vulnerabilities/migrations/0124_advisorytodov2_todorelatedadvisoryv2_and_more.py @@ -0,0 +1,177 @@ +# Generated by Django 5.2.11 on 2026-04-27 03:35 + +import django.db.models.deletion +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("vulnerabilities", "0123_alter_packagev2_options_alter_packagev2_package_url_and_more"), + ] + + operations = [ + migrations.DeleteModel( + name="AdvisoryToDoV2", + ), + migrations.DeleteModel( + name="ToDoRelatedAdvisoryV2", + ), + migrations.CreateModel( + name="AdvisoryToDoV2", + fields=[ + ( + "todo_id", + models.UUIDField( + default=uuid.uuid4, editable=False, primary_key=True, serialize=False + ), + ), + ( + "related_advisories_id", + models.CharField( + help_text="SHA1 digest of the unique_content_id field of the applicable advisories.", + max_length=40, + ), + ), + ( + "alias", + models.CharField( + db_index=True, + help_text="Alias associated with TODO advisories", + max_length=50, + ), + ), + ( + "advisories_count", + models.IntegerField( + db_index=True, + default=1, + help_text="Number of advisory associated with this TODO.", + ), + ), + ( + "issue_type", + models.CharField( + choices=[ + ("MISSING_AFFECTED_PACKAGE", "Advisory is missing affected package"), + ("MISSING_FIXED_BY_PACKAGE", "Advisory is missing fixed-by package"), + ( + "MISSING_AFFECTED_AND_FIXED_BY_PACKAGES", + "Advisory is missing both affected and fixed-by packages", + ), + ("MISSING_SUMMARY", "Advisory is missing summary"), + ( + 
"CONFLICTING_FIXED_BY_PACKAGES", + "Advisories have conflicting fixed-by packages", + ), + ( + "CONFLICTING_AFFECTED_PACKAGES", + "Advisories have conflicting affected packages", + ), + ( + "CONFLICTING_AFFECTED_AND_FIXED_BY_PACKAGES", + "Advisories have conflicting affected and fixed-by packages", + ), + ( + "CONFLICTING_SEVERITY_SCORES", + "Advisories have conflicting severity scores", + ), + ], + db_index=True, + help_text="Select the issue that needs to be addressed from the available options.", + max_length=50, + ), + ), + ( + "issue_detail", + models.TextField(blank=True, help_text="Additional details about the issue."), + ), + ( + "created_at", + models.DateTimeField( + auto_now_add=True, + help_text="Timestamp indicating when this TODO was created.", + ), + ), + ( + "is_resolved", + models.BooleanField( + db_index=True, default=False, help_text="This TODO is resolved or not." + ), + ), + ( + "resolved_at", + models.DateTimeField( + blank=True, + help_text="Timestamp indicating when this TODO was resolved.", + null=True, + ), + ), + ( + "resolution_detail", + models.TextField( + blank=True, help_text="Additional detail on how this TODO was resolved." 
+ ), + ), + ( + "oldest_advisory_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp indicating when the oldest advisory was published, used for triaging TODOs.", + null=True, + ), + ), + ( + "is_todo_stale", + models.BooleanField( + db_index=True, + default=False, + help_text="TODOs are marked stale if associate advisory is no longer the latest version of the advisory.", + ), + ), + ], + ), + migrations.CreateModel( + name="ToDoRelatedAdvisoryV2", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ( + "advisory", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="vulnerabilities.advisoryv2" + ), + ), + ( + "todo", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="vulnerabilities.advisorytodov2", + ), + ), + ], + options={ + "unique_together": {("todo", "advisory")}, + }, + ), + migrations.AddField( + model_name="advisorytodov2", + name="advisories", + field=models.ManyToManyField( + help_text="Advisory/ies where this TODO is applicable.", + related_name="advisory_todos", + through="vulnerabilities.ToDoRelatedAdvisoryV2", + to="vulnerabilities.advisoryv2", + ), + ), + migrations.AlterUniqueTogether( + name="advisorytodov2", + unique_together={("related_advisories_id", "issue_type")}, + ), + ] diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py index 4efc04766..6d0b6fbb4 100644 --- a/vulnerabilities/models.py +++ b/vulnerabilities/models.py @@ -2521,9 +2521,21 @@ class Meta: unique_together = ("related_advisories_id", "issue_type") +class AdvisoryToDoV2QuerySet(models.QuerySet): + + def exclude_stale(self): + return self.exclude(is_todo_stale=True) + + class AdvisoryToDoV2(models.Model): """Track the TODOs for advisory/ies that need to be addressed.""" + todo_id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False, + ) + # Since we can not make advisories field (M2M field) 
unique # (see https://code.djangoproject.com/ticket/702), we use related_advisories_id # to avoid creating duplicate issue for same set of advisories, @@ -2539,6 +2551,20 @@ class AdvisoryToDoV2(models.Model): help_text="Advisory/ies where this TODO is applicable.", ) + alias = models.CharField( + max_length=50, + db_index=True, + blank=False, + null=False, + help_text="Alias associated with TODO advisories", + ) + + advisories_count = models.IntegerField( + help_text="Number of advisory associated with this TODO.", + default=1, + db_index=True, + ) + issue_type = models.CharField( max_length=50, choices=ISSUE_TYPE_CHOICES, @@ -2573,6 +2599,22 @@ class AdvisoryToDoV2(models.Model): help_text="Additional detail on how this TODO was resolved.", ) + oldest_advisory_date = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text="Timestamp indicating when the oldest advisory was published, used for triaging TODOs.", + ) + + is_todo_stale = models.BooleanField( + null=False, + db_index=True, + default=False, + help_text="TODOs are marked stale if associate advisory is no longer the latest version of the advisory.", + ) + + objects = AdvisoryToDoV2QuerySet.as_manager() + class Meta: unique_together = ("related_advisories_id", "issue_type") @@ -2958,6 +3000,12 @@ def latest_advisories_for_purl(self, purl): qs = self.filter(id__in=Subquery(adv_ids)) return qs.latest_per_avid() + def todo_excluded(self): + """Exclude advisory ineligible for ToDo computation.""" + from vulnerabilities.importers import TODO_EXCLUDED_PIPELINES + + return self.exclude(datasource_id__in=TODO_EXCLUDED_PIPELINES) + class AdvisorySet(models.Model): diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py index 499f53331..51728b631 100644 --- a/vulnerabilities/pipelines/__init__.py +++ b/vulnerabilities/pipelines/__init__.py @@ -278,6 +278,11 @@ class VulnerableCodeBaseImporterPipelineV2(VulnerableCodePipeline): ignorable_versions = [] 
precedence = 0 + # Set this to True if computing fixed/affected package ToDo is not fruitful for this source. + # An example of such advisory would be pipeline dedicated to collecting issues, + # pull requests, commit messages, EPSS, exploits, etc. + exclude_from_package_todo = False + # Control how often progress log is shown (range: 1–100, higher value = less frequent log) progress_step = 10 diff --git a/vulnerabilities/pipelines/v2_importers/__init__.py b/vulnerabilities/pipelines/v2_importers/__init__.py new file mode 100644 index 000000000..20854f2ad --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# diff --git a/vulnerabilities/pipelines/v2_importers/aosp_importer.py b/vulnerabilities/pipelines/v2_importers/aosp_importer.py index 23bcda86f..1abe91776 100644 --- a/vulnerabilities/pipelines/v2_importers/aosp_importer.py +++ b/vulnerabilities/pipelines/v2_importers/aosp_importer.py @@ -32,6 +32,7 @@ class AospImporterPipeline(VulnerableCodeBaseImporterPipelineV2): license_url = "https://github.com/quarkslab/aosp_dataset/blob/master/LICENSE" precedence = 200 + exclude_from_package_todo = True @classmethod def steps(cls): diff --git a/vulnerabilities/pipelines/v2_importers/epss_importer_v2.py b/vulnerabilities/pipelines/v2_importers/epss_importer_v2.py index 6f8adc6d1..007341d0c 100644 --- a/vulnerabilities/pipelines/v2_importers/epss_importer_v2.py +++ b/vulnerabilities/pipelines/v2_importers/epss_importer_v2.py @@ -30,6 +30,8 @@ class EPSSImporterPipeline(VulnerableCodeBaseImporterPipelineV2): spdx_license_expression = "unknown" importer_name = "EPSS Importer" + exclude_from_package_todo = True + precedence = 200 def advisories_count(self): diff --git a/vulnerabilities/pipelines/v2_importers/nvd_importer.py b/vulnerabilities/pipelines/v2_importers/nvd_importer.py index d689aaa05..7c5faf73e 100644 --- a/vulnerabilities/pipelines/v2_importers/nvd_importer.py +++ b/vulnerabilities/pipelines/v2_importers/nvd_importer.py @@ -71,6 +71,8 @@ class NVDImporterPipeline(VulnerableCodeBaseImporterPipelineV2): MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 
""" + exclude_from_package_todo = True + precedence = 100 @classmethod diff --git a/vulnerabilities/pipelines/v2_importers/project_kb_msr2019_importer.py b/vulnerabilities/pipelines/v2_importers/project_kb_msr2019_importer.py index 1a4411acd..269c92f71 100644 --- a/vulnerabilities/pipelines/v2_importers/project_kb_msr2019_importer.py +++ b/vulnerabilities/pipelines/v2_importers/project_kb_msr2019_importer.py @@ -30,6 +30,8 @@ class ProjectKBMSR2019Pipeline(VulnerableCodeBaseImporterPipelineV2): license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt" repo_url = "git+https://github.com/SAP/project-kb" + exclude_from_package_todo = True + precedence = 200 @classmethod diff --git a/vulnerabilities/pipelines/v2_importers/project_kb_statements_importer.py b/vulnerabilities/pipelines/v2_importers/project_kb_statements_importer.py index 0c1c8e05d..a4200cedb 100644 --- a/vulnerabilities/pipelines/v2_importers/project_kb_statements_importer.py +++ b/vulnerabilities/pipelines/v2_importers/project_kb_statements_importer.py @@ -37,6 +37,8 @@ class ProjectKBStatementsPipeline(VulnerableCodeBaseImporterPipelineV2): license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt" repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data" + exclude_from_package_todo = True + precedence = 200 @classmethod diff --git a/vulnerabilities/pipelines/v2_importers/suse_score_importer.py b/vulnerabilities/pipelines/v2_importers/suse_score_importer.py index 92a534ddc..299dcb256 100644 --- a/vulnerabilities/pipelines/v2_importers/suse_score_importer.py +++ b/vulnerabilities/pipelines/v2_importers/suse_score_importer.py @@ -23,6 +23,8 @@ class SUSESeverityScoreImporterPipeline(VulnerableCodeBaseImporterPipelineV2): pipeline_id = "suse_importer_v2" url = "https://ftp.suse.com/pub/projects/security/yaml/suse-cvss-scores.yaml" + exclude_from_package_todo = True + @classmethod def steps(cls): return ( diff --git 
a/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py b/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py index 981f10e92..e719cf595 100644 --- a/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py +++ b/vulnerabilities/pipelines/v2_improvers/compute_advisory_todo.py @@ -9,16 +9,25 @@ import json +from collections import Counter +from collections import defaultdict +from itertools import chain from aboutcode.pipeline import LoopProgress +from django.db.models import Prefetch from django.utils import timezone +from packageurl import PackageURL +from vulnerabilities.importer import AdvisoryDataV2 from vulnerabilities.models import AdvisoryAlias from vulnerabilities.models import AdvisoryToDoV2 from vulnerabilities.models import AdvisoryV2 from vulnerabilities.models import ToDoRelatedAdvisoryV2 from vulnerabilities.pipelines import VulnerableCodePipeline from vulnerabilities.pipes.advisory import advisories_checksum +from vulnerabilities.utils import canonical_value +from vulnerabilities.utils import normalize_text +from vulnerabilities.utils import sha256_digest class ComputeToDo(VulnerableCodePipeline): @@ -36,8 +45,15 @@ def steps(cls): def compute_individual_advisory_todo(self): """Create ToDos for missing summary, affected and fixed packages.""" - advisories = AdvisoryV2.objects.all().prefetch_related( - "impacted_packages", + advisories = ( + AdvisoryV2.objects.todo_excluded() + .latest_per_avid() + .exclude(advisory_todos__issue_type="MISSING_SUMMARY") + .exclude(advisory_todos__issue_type="MISSING_AFFECTED_PACKAGE") + .exclude(advisory_todos__issue_type="MISSING_FIXED_BY_PACKAGE") + .prefetch_related( + "impacted_packages", + ) ) advisories_count = advisories.count() advisory_relation_to_create = {} @@ -51,7 +67,7 @@ def compute_individual_advisory_todo(self): progress = LoopProgress( total_iterations=advisories_count, logger=self.log, - progress_step=1, + progress_step=10, ) for advisory in 
progress.iter(advisories.iterator(chunk_size=5000)): advisory_todo_id = advisories_checksum(advisories=advisory) @@ -93,37 +109,142 @@ def detect_conflicting_advisories(self): Create ToDos for advisories with conflicting opinions on fixed and affected package versions for a vulnerability. """ - aliases = AdvisoryAlias.objects.filter(alias__istartswith="cve") - aliases_count = aliases.count() advisory_relation_to_create = {} + unfurled_purl_summary = Counter() todo_to_create = [] new_todos_count = 0 - batch_size = 5000 + batch_size = 1000 + total_count_conflicting_advisory = 0 + total_package_conflict_count = 0 + total_uncomparable_advisory_count = 0 + total_successfully_compared_advisory_count = 0 + existing_todo_ids = set( + AdvisoryToDoV2.objects.values_list("related_advisories_id", flat=True) + ) - self.log(f"Cross validating advisory affected and fixed package for {aliases_count} CVEs") + advisory_qs = ( + AdvisoryV2.objects.exclude( + advisory_todos__issue_type="MISSING_AFFECTED_AND_FIXED_BY_PACKAGES" + ) + .exclude(advisory_todos__issue_type="MISSING_AFFECTED_PACKAGE") + .exclude(advisory_todos__issue_type="MISSING_FIXED_BY_PACKAGE") + .todo_excluded() + .latest_per_avid() + .distinct() + .prefetch_related( + "impacted_packages", + "impacted_packages__affecting_packages", + "impacted_packages__fixed_by_packages", + ) + ) + cve_aliases = AdvisoryAlias.objects.filter(alias__istartswith="cve").prefetch_related( + Prefetch("advisories", queryset=advisory_qs, to_attr="filtered_advisories") + ) + non_cve_aliases = AdvisoryAlias.objects.exclude(alias__istartswith="cve").prefetch_related( + Prefetch("advisories", queryset=advisory_qs, to_attr="filtered_advisories") + ) + aliases_count = cve_aliases.count() + non_cve_aliases.count() progress = LoopProgress( total_iterations=aliases_count, logger=self.log, - progress_step=1, + progress_step=5, + ) + self.log(f"Detect conflicting affected and fixed package for {aliases_count} aliases.") + aliases = chain( + 
cve_aliases.iterator(chunk_size=50), + non_cve_aliases.iterator(chunk_size=50), ) - for alias in progress.iter(aliases.iterator(chunk_size=2000)): - advisories = ( - alias.advisories.exclude( - advisory_todos__issue_type="MISSING_AFFECTED_AND_FIXED_BY_PACKAGES" + for alias in progress.iter(aliases): + adv_purl_map = defaultdict(set) + purl_adv_map = defaultdict( + lambda: defaultdict( + lambda: { + "affected": set(), + "fixed": set(), + "impact_count": 0, + } ) - .distinct() - .prefetch_related( - "impacted_packages", + ) + unfurled_base_purls = set() + advisories_with_unfurled_purls = set() + + advisories_with_common_alias = alias.filtered_advisories or [] + known_advisory_ids = [a.id for a in advisories_with_common_alias] + adv_with_alias_in_adv_id = advisory_qs.filter(advisory_id=alias.alias).exclude( + id__in=known_advisory_ids + ) + if not advisories_with_common_alias and not adv_with_alias_in_adv_id.exists(): + continue + + advisories_with_common_alias.extend(adv_with_alias_in_adv_id) + if len(advisories_with_common_alias) < 2: + total_successfully_compared_advisory_count += 1 + continue + + for advisory in advisories_with_common_alias: + for impact in advisory.impacted_packages.all(): + base_purl = impact.base_purl + adv_purl_map[advisory.avid].add(base_purl) + advisory_map = purl_adv_map[base_purl][advisory.avid] + advisory_map["affected"].update( + p.version for p in impact.affecting_packages.all() + ) + advisory_map["fixed"].update(p.version for p in impact.fixed_by_packages.all()) + advisory_map["impact_count"] += 1 + + if not impact.last_successful_range_unfurl_at: + unfurled_base_purls.add(base_purl) + advisories_with_unfurled_purls.add(advisory.avid) + + # keep only PURLs linked to at least 2 advisories + comparable_purl_map = { + purl: value for purl, value in purl_adv_map.items() if len(value) >= 2 + } + + uncomparable_purls = {purl for purl, avids in purl_adv_map.items() if len(avids) < 2} + + comparable_adv_map = { + adv: (purls - 
uncomparable_purls) + for adv, purls in adv_purl_map.items() + if (purls - uncomparable_purls) + } + + avids_with_common_alias_and_purl = set(comparable_adv_map) + + advisory_group = { + adv.avid: adv + for adv in advisories_with_common_alias + if adv.avid in avids_with_common_alias_and_purl + } + + if not len(advisory_group) > 1: + continue + # if any eligible PURL is not unfurled, skip + if set(comparable_purl_map) & unfurled_base_purls: + unfurled_purl_summary.update( + PackageURL.from_string(up).type for up in unfurled_base_purls + ) + + total_uncomparable_advisory_count += len(advisories_with_unfurled_purls) + continue + + package_conflict_count, count_conflicting_advisory = ( + check_conflicting_affected_and_fixed_by_packages_for_alias( + purl_adv_map=comparable_purl_map, + alias=alias, + advisories=advisory_group, + todo_to_create=todo_to_create, + advisory_relation_to_create=advisory_relation_to_create, + existing_todo_ids=existing_todo_ids, ) ) - check_conflicting_affected_and_fixed_by_packages_for_alias( - advisories=advisories, - cve=alias, - todo_to_create=todo_to_create, - advisory_relation_to_create=advisory_relation_to_create, + total_successfully_compared_advisory_count += len(advisory_group) - len( + advisories_with_unfurled_purls ) + total_package_conflict_count += package_conflict_count + total_count_conflicting_advisory += count_conflicting_advisory if len(todo_to_create) > batch_size: new_todos_count += bulk_create_with_m2m( @@ -141,8 +262,14 @@ def detect_conflicting_advisories(self): ) self.log( - f"Successfully created {new_todos_count} ToDos for conflicting affected and fixed packages" + f"Successfully compared {total_successfully_compared_advisory_count} advisories, created {new_todos_count} new ToDos for {total_package_conflict_count} " + f"conflicting affected and fixed packages related to {total_count_conflicting_advisory} advisories." 
+ ) + self.log( + f"Could not compare version range for {total_uncomparable_advisory_count} advisory " + "containing unfurled packages." ) + self.log(f"Summary of unfurled PURLs: \n {unfurled_purl_summary}") def check_missing_summary( @@ -151,10 +278,15 @@ def check_missing_summary( todo_to_create, advisory_relation_to_create, ): + alias = advisory.datasource_id.rsplit("/", 1)[-1] + oldest_advisory_date = advisory.date_published or advisory.date_collected if not advisory.summary: todo = AdvisoryToDoV2( related_advisories_id=todo_id, issue_type="MISSING_SUMMARY", + alias=alias, + advisories_count=1, + oldest_advisory_date=oldest_advisory_date, ) advisory_relation_to_create[todo_id] = [advisory] todo_to_create.append(todo) @@ -198,98 +330,111 @@ def check_missing_affected_and_fixed_by_packages( elif not has_fixed_package: issue_type = "MISSING_FIXED_BY_PACKAGE" + alias = advisory.datasource_id.rsplit("/", 1)[-1] + oldest_advisory_date = advisory.date_published or advisory.date_collected if issue_type: todo = AdvisoryToDoV2( related_advisories_id=todo_id, issue_type=issue_type, + alias=alias, + advisories_count=1, + oldest_advisory_date=oldest_advisory_date, ) todo_to_create.append(todo) advisory_relation_to_create[todo_id] = [advisory] +def compute_version_range_disagreement(adv_map): + """Compute differences in affected and fixed version sets across advisories.""" + + affected_sets = [v["affected"] for v in adv_map.values()] + fixed_sets = [v["fixed"] for v in adv_map.values()] + + affected_union = set().union(*affected_sets) + affected_intersection = set.intersection(*affected_sets) + + fixed_union = set().union(*fixed_sets) + fixed_intersection = set.intersection(*fixed_sets) + + return { + "affected_union": affected_union, + "affected_intersection": affected_intersection, + "affected_disagreement": affected_union - affected_intersection, + "fixed_union": fixed_union, + "fixed_intersection": fixed_intersection, + "fixed_disagreement": fixed_union - 
fixed_intersection, + } + + def check_conflicting_affected_and_fixed_by_packages_for_alias( + purl_adv_map, + alias, advisories, - cve, todo_to_create, advisory_relation_to_create, + existing_todo_ids, ): """ Add appropriate AdvisoryToDo for conflicting affected/fixed packages. - Compute the comparison matrix for the given set of advisories. Iterate through each advisory - and compute and store fixed versionsrange and affected versionrange for each advisory, - keyed by purl. + Compute the comparison matrix for the given set of advisories. Iterate through each purl_adv_map + and compute and store version range disagreement for conflicting affected/fixed range keyed by PURL. - Use the matrix to determine conflicts in affected/fixed versions for each purl. If for any purl - there is more than one set of fixed versionrange or more than one set of affected versionrange, - it means the advisories have conflicting opinions on the fixed or affected packages. + Also compute partial curation advisory by merging non conflicting component of conflicting in advisory. + Conflict package details, partial curation advisory is stored in issue_detail field. 
- Example of comparison matrix: + Example of conflicting_package_details: { - "pkg:npm/foo/bar": { - "affected": { - Advisory1: frozenset(VersionRange1, VersionRange2), - Advisory2: frozenset(...), - }, - "fixed": { - Advisory1: frozenset(VersionRange1, VersionRange2), - Advisory2: frozenset(...), - }, + "pkg:maven/org.apache.struts/struts2-core": { + "avids": [ + "github_osv_importer_v2/GHSA-mwrx-hx6x-3hhv", + "gitlab_importer_v2/maven/org.apache.struts/struts2-core/CVE-2012-0838" + ], + "affected_union": {"2.1.8.1", "2.0.8", "2.1.2", "2.0.5", "2.0.11", "2.2.1.1", "2.2.3"}, + "affected_intersection": {"2.1.8.1", "2.0.8", "2.1.2", "2.0.5", "2.0.11", "2.2.3"}, + "affected_disagreement": {"2.2.1.1"}, + "fixed_union": {"2.2.3.1"}, + "fixed_intersection": {"2.2.3.1"}, + "fixed_disagreement": set() }, "pkg:pypi/foobar": { - "affected": { - Advisory1: frozenset(...), - Advisory2: frozenset(...), - }, - "fixed": { - Advisory1: frozenset(...), - Advisory2: frozenset(...), - }, + "avids": [ + "pypa_importer_v2/PYSEC-xxxx-18", + "pysec_importer_v2/PYSEC-xxxx-18" + ], + "affected_union": {"2.1.8.1", "2.0.8"}, + "affected_intersection": {"2.1.8.1", "2.0.8"}, + "affected_disagreement": set(), + "fixed_union": {"3.1", "3.0"}, + "fixed_intersection": {"3.1"}, + "fixed_disagreement": {"3.0"}, }, ... 
} """ - matrix = {} - for advisory in advisories: - advisory_id = advisory.unique_content_id - for impacted in advisory.impacted_packages.all() or []: - affected_purl = impacted.base_purl - - initialize_sub_matrix( - matrix=matrix, - affected_purl=affected_purl, - advisory=advisory, - ) - - if fixed_version_range := impacted.fixed_vers: - matrix[affected_purl]["fixed"][advisory_id].add(fixed_version_range) - - if affecting_version_range := impacted.affecting_vers: - matrix[affected_purl]["affected"][advisory_id].add(affecting_version_range) + conflicting_package_details = {} has_conflicting_affected_packages = False has_conflicting_fixed_package = False - messages = [] - for purl, board in matrix.items(): - fixed = board.get("fixed", {}).values() - impacted = board.get("affected", {}).values() - - unique_set_of_affected_vers = {frozenset(vers) for vers in impacted} - unique_set_of_fixed_vers = {frozenset(vers) for vers in fixed} + conflicting_advisories = set() + for purl, adv_map in purl_adv_map.items(): + result = compute_version_range_disagreement(adv_map) + if not (result["fixed_disagreement"] or result["affected_disagreement"]): + continue - if len(unique_set_of_affected_vers) > 1: - has_conflicting_affected_packages = True - messages.append( - f"{cve}: {purl} with conflicting affected versions {unique_set_of_affected_vers}" - ) - if len(unique_set_of_fixed_vers) > 1: + if result["fixed_disagreement"]: has_conflicting_fixed_package = True - messages.append( - f"{cve}: {purl} with conflicting fixed version {unique_set_of_fixed_vers}" - ) + if result["affected_disagreement"]: + has_conflicting_affected_packages = True + + conflicting_package_details[purl] = { + "avids": list(adv_map.keys()), + } + conflicting_advisories.update([advisories[avid] for avid in adv_map]) + conflicting_package_details[purl].update(result) if not has_conflicting_affected_packages and not has_conflicting_fixed_package: - return + return 0, 0 issue_type = 
"CONFLICTING_AFFECTED_AND_FIXED_BY_PACKAGES" if not has_conflicting_fixed_package: @@ -297,33 +442,160 @@ def check_conflicting_affected_and_fixed_by_packages_for_alias( elif not has_conflicting_affected_packages: issue_type = "CONFLICTING_FIXED_BY_PACKAGES" + conflicting_advisories = list(conflicting_advisories) + conflicting_avids = [avd.avid for avd in conflicting_advisories] + non_conflicting_purl_avid_map = get_best_impact_for_non_conflicting_purls( + purl_adv_map, + conflicting_package_details, + conflicting_avids, + ) + partial_merged_advisory = merged_advisory(conflicting_advisories, non_conflicting_purl_avid_map) + conflict_checksum = sha256_digest(canonical_value(conflicting_package_details)) + issue_detail = { - "Conflict summary": messages, - "Conflict matrix": matrix, + "alias": alias.alias, + "conflict_checksum": conflict_checksum, + "conflict_details": conflicting_package_details, + "partial_merged_advisory": partial_merged_advisory, } - todo_id = advisories_checksum(advisories) + todo_id = advisories_checksum(conflicting_advisories) + + if todo_id in existing_todo_ids: + return 0, 0 + + existing_todo_ids.add(todo_id) + conflicting_advisories_count = len(conflicting_advisories) + conflicting_package_count = len(conflicting_package_details) + + date_published = min( + (a.date_published for a in conflicting_advisories if a.date_published), + default=None, + ) + date_collected = min( + (a.date_collected for a in conflicting_advisories if a.date_collected), + default=None, + ) todo = AdvisoryToDoV2( related_advisories_id=todo_id, issue_type=issue_type, issue_detail=json.dumps(issue_detail, default=list), + alias=alias, + advisories_count=conflicting_advisories_count, + oldest_advisory_date=date_published or date_collected, ) todo_to_create.append(todo) - advisory_relation_to_create[todo_id] = list(advisories) + advisory_relation_to_create[todo_id] = conflicting_advisories + return conflicting_package_count, conflicting_advisories_count -def 
initialize_sub_matrix(matrix, affected_purl, advisory): - advisory_id = advisory.unique_content_id - if affected_purl not in matrix: - matrix[affected_purl] = { - "affected": {advisory_id: set()}, - "fixed": {advisory_id: set()}, - } - else: - if advisory not in matrix[affected_purl]["affected"]: - matrix[affected_purl]["affected"][advisory_id] = set() - if advisory not in matrix[affected_purl]["fixed"]: - matrix[affected_purl]["fixed"][advisory_id] = set() + +def get_best_impact_for_non_conflicting_purls( + purl_adv_map, conflicting_package_details, conflicting_avids +): + """ + Return PURL - AVID mapping for non-conflicting packages. + + Select only one advisory per PURL based on maximum impact package count. + """ + best_purl_avid_map = {} + for purl, advs in purl_adv_map.items(): + if purl in conflicting_package_details: + continue + + candidates = [ + (avid, values["impact_count"]) + for avid, values in advs.items() + if avid in conflicting_avids + ] + + if candidates: + best_purl_avid_map[purl] = max(candidates, key=lambda x: x[1]) + return best_purl_avid_map + + +def merged_advisory(advisories, non_conflicting_purl_avid_map): + """Merge multiple advisory to one removing any duplicates or conflicting package ranges.""" + merged_adv = { + "aliases": set(), + "summary": "", + "affected_packages": [], + "references": [], + "patches": [], + "severities": [], + "weaknesses": set(), + } + + seen_affected = set() + seen_references = set() + seen_patches = set() + seen_severities = set() + seen_summaries = {} + merged_summary = [] + + for adv in advisories: + adv_dict = adv.to_advisory_data().to_dict() + + merged_adv["aliases"].update(adv_dict.get("aliases", [])) + merged_adv["weaknesses"].update(adv_dict.get("weaknesses", [])) + + if summary := adv_dict.get("summary", "").strip(): + key = normalize_text(summary) + entry = seen_summaries.setdefault(key, [summary, []]) + entry[1].append(adv.avid) + + for ref in adv_dict.get("references", []): + update_advisory_item( + 
item=ref, + seen_item=seen_references, + updatable=merged_adv["references"], + ) + + for patch in adv_dict.get("patches", []): + update_advisory_item( + item=patch, + seen_item=seen_patches, + updatable=merged_adv["patches"], + ) + + for sev in adv_dict.get("severities", []): + update_advisory_item( + item=sev, + seen_item=seen_severities, + updatable=merged_adv["severities"], + ) + + for affected in adv_dict.get("affected_packages", []): + base_purl = PackageURL(**affected["package"]).to_string() + if ( + base_purl in non_conflicting_purl_avid_map + and non_conflicting_purl_avid_map[base_purl][0] == adv.avid + ): + update_advisory_item( + item=affected, + seen_item=seen_affected, + updatable=merged_adv["affected_packages"], + ) + + for summary, avids in seen_summaries.values(): + merged_summary.append(f"{tuple(avids)}: {summary}") + + merged_adv["summary"] = "\n".join(merged_summary) + merged_adv["aliases"] = list(merged_adv["aliases"]) + merged_adv["weaknesses"] = list(merged_adv["weaknesses"]) + + merged_adv["advisory_id"] = "PLACEHOLDER_AVID" + merged_adv["date_published"] = "" + merged_adv = AdvisoryDataV2.from_dict(merged_adv).to_dict() + + return merged_adv + + +def update_advisory_item(item, seen_item, updatable): + digest = hash(canonical_value(item)) + if digest not in seen_item: + seen_item.add(digest) + updatable.append(item) def bulk_create_with_m2m(todos, advisories, logger): diff --git a/vulnerabilities/pipes/vcs_collector_utils.py b/vulnerabilities/pipes/vcs_collector_utils.py index 54db73c1a..80c0be1d0 100644 --- a/vulnerabilities/pipes/vcs_collector_utils.py +++ b/vulnerabilities/pipes/vcs_collector_utils.py @@ -30,6 +30,8 @@ class CollectVCSFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2): Pipeline to collect fix commits from any git repository. 
""" + exclude_from_package_todo = True + repo_url: str patterns: list[str] = [ r"\bCVE-\d{4}-\d{4,19}\b", diff --git a/vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py b/vulnerabilities/tests/pipelines/v2_improvers/test_compute_advisory_todo_v2.py similarity index 56% rename from vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py rename to vulnerabilities/tests/pipelines/v2_improvers/test_compute_advisory_todo_v2.py index e55bf5f6a..4c9a8f275 100644 --- a/vulnerabilities/tests/pipelines/test_compute_advisory_todo_v2.py +++ b/vulnerabilities/tests/pipelines/v2_improvers/test_compute_advisory_todo_v2.py @@ -16,17 +16,20 @@ from vulnerabilities.importer import AdvisoryDataV2 from vulnerabilities.importer import AffectedPackageV2 from vulnerabilities.importer import ReferenceV2 -from vulnerabilities.models import AdvisoryAlias from vulnerabilities.models import AdvisoryToDoV2 from vulnerabilities.models import AdvisoryV2 from vulnerabilities.models import ImpactedPackage from vulnerabilities.pipelines.v2_improvers.compute_advisory_todo import ComputeToDo +from vulnerabilities.pipes.advisory import insert_advisory_v2 +from vulnerabilities.tests.pipelines import TestLogger class TestComputeToDo(TestCase): def setUp(self): + self.log = TestLogger() self.advisory_data1 = AdvisoryDataV2( advisory_id="test_id", + aliases=["CVE-000-000"], summary="Test summary", affected_packages=[ AffectedPackageV2( @@ -67,6 +70,7 @@ def setUp(self): self.advisory_data4 = AdvisoryDataV2( advisory_id="test_id_3", + aliases=["CVE-000-000"], summary="Test summary", affected_packages=[ AffectedPackageV2( @@ -80,23 +84,14 @@ def setUp(self): ) def test_advisory_todo_missing_summary(self): - date = datetime.now() - adv = AdvisoryV2.objects.create( - unique_content_id="test_id", - url=self.advisory_data1.url, - summary="", - date_collected=date, - advisory_id="test_id", - avid="test_pipeline/test_id", - datasource_id="test_pipeline", + insert_advisory_v2( + 
advisory=self.advisory_data1, + pipeline_id="test_pipeline1", + logger=self.log.write, ) - for pkg in self.advisory_data1.affected_packages: - ImpactedPackage.objects.create( - advisory=adv, - base_purl=pkg.package, - affecting_vers=pkg.affected_version_range, - fixed_vers=pkg.fixed_version_range, - ) + adv = AdvisoryV2.objects.first() + adv.summary = "" + adv.save() pipeline = ComputeToDo() pipeline.execute() @@ -106,23 +101,11 @@ def test_advisory_todo_missing_summary(self): self.assertEqual(1, todo.advisories.count()) def test_advisory_todo_missing_fixed(self): - date = datetime.now() - adv = AdvisoryV2.objects.create( - unique_content_id="test_id", - url=self.advisory_data2.url, - summary=self.advisory_data2.summary, - date_collected=date, - advisory_id="test_id", - avid="test_pipeline/test_id", - datasource_id="test_pipeline", + insert_advisory_v2( + advisory=self.advisory_data2, + pipeline_id="test_pipeline1", + logger=self.log.write, ) - for pkg in self.advisory_data2.affected_packages: - ImpactedPackage.objects.create( - advisory=adv, - base_purl=pkg.package, - affecting_vers=pkg.affected_version_range, - fixed_vers=pkg.fixed_version_range or "", - ) pipeline = ComputeToDo() pipeline.execute() @@ -132,23 +115,11 @@ def test_advisory_todo_missing_fixed(self): self.assertEqual(1, todo.advisories.count()) def test_advisory_todo_missing_affected(self): - date = datetime.now() - adv = AdvisoryV2.objects.create( - unique_content_id="test_id", - url=self.advisory_data3.url, - summary=self.advisory_data3.summary, - date_collected=date, - advisory_id="test_id", - avid="test_pipeline/test_id", - datasource_id="test_pipeline", + insert_advisory_v2( + advisory=self.advisory_data3, + pipeline_id="test_pipeline1", + logger=self.log.write, ) - for pkg in self.advisory_data3.affected_packages: - ImpactedPackage.objects.create( - advisory=adv, - base_purl=pkg.package, - affecting_vers=pkg.affected_version_range or "", - fixed_vers=pkg.fixed_version_range, - ) pipeline = 
ComputeToDo() pipeline.execute() @@ -158,52 +129,31 @@ def test_advisory_todo_missing_affected(self): self.assertEqual(1, todo.advisories.count()) def test_advisory_todo_conflicting_fixed_affected(self): - alias = AdvisoryAlias.objects.create(alias="CVE-0000-0000") - date = datetime.now() - adv1 = AdvisoryV2.objects.create( - unique_content_id="test_id1", - url=self.advisory_data1.url, - summary=self.advisory_data1.summary, - date_collected=date, - advisory_id="test_id", - avid="test_pipeline/test_id_2", - datasource_id="test_pipeline", + insert_advisory_v2( + advisory=self.advisory_data1, + pipeline_id="test_pipeline1", + logger=self.log.write, ) - for pkg in self.advisory_data1.affected_packages: - ImpactedPackage.objects.create( - advisory=adv1, - base_purl=pkg.package, - affecting_vers=pkg.affected_version_range or "", - fixed_vers=pkg.fixed_version_range or "", - ) - adv1.aliases.add(alias) - adv2 = AdvisoryV2.objects.create( - unique_content_id="test_id2", - url=self.advisory_data4.url, - summary=self.advisory_data4.summary, - date_collected=date, - advisory_id="test_id", - avid="test_pipeline/test_id_2", - datasource_id="test_pipeline", + insert_advisory_v2( + advisory=self.advisory_data4, + pipeline_id="test_pipeline2", + logger=self.log.write, ) - for pkg in self.advisory_data4.affected_packages: - ImpactedPackage.objects.create( - advisory=adv2, - base_purl=pkg.package, - affecting_vers=pkg.affected_version_range or "", - fixed_vers=pkg.fixed_version_range or "", - ) - adv2.aliases.add(alias) + for imp in ImpactedPackage.objects.all(): + imp.last_successful_range_unfurl_at = datetime.now() + imp.save() self.assertEqual(0, AdvisoryToDoV2.objects.count()) pipeline = ComputeToDo() pipeline.execute() todo = AdvisoryToDoV2.objects.first() + adv = AdvisoryV2.objects.first() self.assertEqual(1, AdvisoryToDoV2.objects.count()) self.assertEqual("CONFLICTING_AFFECTED_AND_FIXED_BY_PACKAGES", todo.issue_type) self.assertIn( - "CVE-0000-0000: pkg:npm/package1 with 
def canonical_value(value):
    """
    Return a canonical, order independent representation for
    hashing/deduplication.

    Dicts become sorted ``(key, canonical_value)`` tuples and
    lists/sets/tuples are recursively canonicalized and sorted, so logically
    equal values compare (and hash) identically regardless of ordering.
    Scalars are returned unchanged.

    >>> canonical_value({"b": ["k", "j"], "a": 2})
    (('a', 2), ('b', ('j', 'k')))
    >>> canonical_value([2, 1])
    (1, 2)
    """
    if isinstance(value, dict):
        return tuple(sorted((k, canonical_value(v)) for k, v in value.items()))
    if isinstance(value, (list, set, tuple)):
        canonical_items = [canonical_value(v) for v in value]
        try:
            return tuple(sorted(canonical_items))
        except TypeError:
            # Heterogeneous, non-comparable item types (e.g. int vs str) would
            # make sorted() raise. Fall back to a repr-based ordering so the
            # result stays deterministic; homogeneous inputs keep their natural
            # sort order, leaving existing canonical forms unchanged.
            return tuple(sorted(canonical_items, key=repr))
    return value


def sha256_digest(normalized_data):
    """
    Return the hex SHA256 digest of ``normalized_data`` serialized as compact,
    key-sorted JSON.

    Key sorting and fixed separators make the digest independent of dict
    insertion order and whitespace, so it is stable across runs.
    """
    normalized_json = json.dumps(normalized_data, separators=(",", ":"), sort_keys=True)
    return hashlib.sha256(normalized_json.encode("utf-8")).hexdigest()